The data consist of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions across multiple temporal windows.
User-defined parameters
# Echo the knit-time parameters so the rendered document records this run's
# settings (run/test flags, ecoregion, response variable).
print(params)
## $run
## [1] TRUE
##
## $test_run
## [1] FALSE
##
## $save_figs
## [1] FALSE
##
## $ecoregion
## [1] "forest"
##
## $response
## [1] "TotalTreeCover"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs # whether figures should be written to disk
response <- params$response   # name of the response (cover) column to model
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
run <- params$run # if FALSE, expensive steps (e.g. the correlation plot) are skipped
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
# NOTE(review): no options() call follows the comment above -- the call seems
# to have been removed; confirm whether it is still needed for reproducibility.
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)
The data were compiled in the prepDataForModels.R script.
here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting.Rmd")

# Read the model-ready dataset (compiled in prepDataForModels.R).
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf.rds"))

# A few 3-yr anomaly values are -Inf (annual wet degree days 5th percentile,
# and annual water deficit 95th percentile). Replace each with the smallest
# finite value observed in that column. (These were previously hard-coded as
# -47.8 and -600, which were eyeballed stand-ins for the same quantity.)
for (infCol in c("annWetDegDays_5percentile_3yrAnom",
                 "annWaterDeficit_95percentile_3yrAnom")) {
  isInf <- is.infinite(modDat[[infCol]])
  if (any(isInf)) {
    modDat[isInf, infCol] <- min(modDat[[infCol]][is.finite(modDat[[infCol]])])
  }
}
# # Convert total cover variables into proportions (for later use in beta regression models) ... proportions are already scaled from zero to 1
# modDat <- modDat %>%
# mutate(TotalTreeCover = TotalTreeCover/100,
# CAMCover = CAMCover/100,
# TotalHerbaceousCover = TotalHerbaceousCover/100,
# BareGroundCover = BareGroundCover/100,
# ShrubCover = ShrubCover/100
# )
# For all response variables, make sure there are no 0s: replace exact zeros
# with 0.0001, since the Gamma model framework can't handle zeros.
# One loop replaces the ten copy-pasted assignments in the original.
coverColsToAdjust <- c(
  "TotalTreeCover", "CAMCover", "TotalHerbaceousCover",
  "BareGroundCover", "ShrubCover", "BroadleavedTreeCover_prop",
  "NeedleLeavedTreeCover_prop", "C4Cover_prop", "C3Cover_prop",
  "ForbCover_prop"
)
for (cc in coverColsToAdjust) {
  isZero <- modDat[[cc]] == 0 & !is.na(modDat[[cc]])
  modDat[isZero, cc] <- 0.0001
}
#
# modDat[modDat$TotalTreeCover ==1& !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.999
# modDat[modDat$CAMCover ==1& !is.na(modDat$CAMCover), "CAMCover"] <- 0.999
# modDat[modDat$TotalHerbaceousCover ==1 & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.999
# modDat[modDat$BareGroundCover ==1& !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.999
# modDat[modDat$ShrubCover ==1& !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.999
# modDat[modDat$BroadleavedTreeCover_prop ==1& !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.999
# modDat[modDat$NeedleLeavedTreeCover_prop ==1& !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.999
# modDat[modDat$C4Cover_prop ==1& !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.999
# modDat[modDat$C3Cover_prop ==1& !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.999
# modDat[modDat$ForbCover_prop ==1& !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.999
set.seed(1234)
# Drop the raw single-year climate columns and give the remaining long-term
# climate ("_CLIM"), soil, and 3-yr anomaly ("_3yrAnom") columns short working
# names used throughout the rest of the document.
modDat_1 <- modDat %>%
select(-c(prcp_annTotal:annVPD_min)) %>%
# mutate(Lon = st_coordinates(.)[,1],
# Lat = st_coordinates(.)[,2]) %>%
# st_drop_geometry() %>%
# filter(!is.na(newRegion))
rename("tmin" = tmin_meanAnnAvg_CLIM,
"tmax" = tmax_meanAnnAvg_CLIM, #1
"tmean" = tmean_meanAnnAvg_CLIM,
"prcp" = prcp_meanAnnTotal_CLIM,
"t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
"t_cold" = T_coldestMonth_meanAnnAvg_CLIM,
"prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
"prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM,
"prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
"prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM, #3
"abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM,
"isothermality" = isothermality_meanAnnAvg_CLIM, #4
"annWatDef" = annWaterDeficit_meanAnnAvg_CLIM,
"annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
"VPD_mean" = annVPD_mean_meanAnnAvg_CLIM,
"VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
"VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
"VPD_max_95" = annVPD_max_95percentile_CLIM,
"annWatDef_95" = annWaterDeficit_95percentile_CLIM,
"annWetDegDays_5" = annWetDegDays_5percentile_CLIM,
"frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM,
"frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM,
# soil variables
"soilDepth" = soilDepth, #7
"clay" = surfaceClay_perc,
"sand" = avgSandPerc_acrossDepth, #8
"coarse" = avgCoarsePerc_acrossDepth, #9
"carbon" = avgOrganicCarbonPerc_0_3cm, #10
"AWHC" = totalAvailableWaterHoldingCapacity,
## anomaly variables
# NOTE: "precp_dry_anom" (not "prcp_dry_anom") is a typo that is now baked
# into downstream prednames vectors and model objects -- do not "fix" it here
# without updating every later reference.
tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom, #19
t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom, #22
prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23
prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25
isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom, #28
VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom, #30
VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom, #31
VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33
annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom , #34
frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35
frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
)
# When test-running, work with a random 100k-row subset for speed.
if (test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}
# Shift the response up by 2 before model fitting.
# NOTE(review): downstream prediction code subtracts this 2 back off; confirm
# the shift is intended everywhere the response is compared to predictions.
modDat_1[[response]] <- modDat_1[[response]] + 2
# Pull the knit-time settings into local variables and log them in the output.
ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion,
             " and the response variable is ", response))
## [1] "In this model run, the ecoregion is forest and the response variable is TotalTreeCover"
# Subset to the requested ecoregion. Any other value of `ecoregion` leaves
# the data unfiltered.
if (ecoregion == "shrubGrass") {
  modDat_1 <- dplyr::filter(modDat_1, newRegion == "dryShrubGrass")
} else if (ecoregion == "forest") {
  modDat_1 <- dplyr::filter(modDat_1, newRegion %in% c("eastForest", "westForest"))
}
# remove the rows that have no observations for the response variable of interest
modDat_1 <- modDat_1[!is.na(modDat_1[,response]),]
# Split out the Texas-Louisiana coastal plain so it can be downsampled --
# presumably because it is over-represented in the data; confirm rationale.
modDat_1_noLA <- modDat_1 %>%
filter(NA_L2NAME != "TEXAS-LOUISIANA COASTAL PLAIN")
modDat_1_LA <- modDat_1 %>%
filter(NA_L2NAME == "TEXAS-LOUISIANA COASTAL PLAIN")
# sample points: keep a random 30% of the coastal-plain rows, plus all others
modDat_1 <- modDat_1_LA %>%
slice_sample(n = round(nrow(modDat_1_LA)*.3)) %>%
rbind(modDat_1_noLA)
# Distribution of the (shifted) response for this ecoregion.
# (Relies on data.frame single-column drop: modDat_1[,response] is a vector.)
hist(modDat_1[,response], main = paste0("Histogram of ",response),
xlab = paste0(response))
The following are the candidate predictor variables for this ecoregion:
# Candidate predictor sets differ by ecoregion. Names ending in "_anom" are
# 3-yr anomalies; the rest are long-term means or soil properties (see the
# rename() earlier in this document). "precp_dry_anom" is an established typo
# carried through from the rename step.
if (ecoregion == "shrubGrass") {
# select potential predictor variables for the ecoregion of interest
prednames <-
c(
"tmean" , "prcp" ,"prcp_seasonality" ,"prcpTempCorr" ,
"isothermality" , "annWatDef" ,"sand" ,"coarse" ,
"carbon" , "AWHC" ,"tmin_anom" ,"tmax_anom" ,
"t_warm_anom" , "prcp_wet_anom" ,"precp_dry_anom" ,"prcp_seasonality_anom" ,
"prcpTempCorr_anom" , "aboveFreezingMonth_anom" ,"isothermality_anom" ,"annWatDef_anom" ,
"annWetDegDays_anom", "VPD_mean_anom" ,"VPD_min_anom" ,"frostFreeDays_5_anom" )
} else if (ecoregion == "forest") {
# select potential predictor variables for the ecoregion of interest
# (adds prcp_dry, clay, VPD_max_95_anom; drops seasonality, VPD_min_anom, ...)
prednames <-
c(
"tmean" ,"prcp" , "prcp_dry" , "prcpTempCorr" ,
"isothermality" ,"annWatDef" , "clay" , "sand" ,
"coarse" ,"carbon" , "AWHC" , "tmin_anom" ,
"tmax_anom" ,"prcp_anom" , "prcp_wet_anom" , "precp_dry_anom" ,
"prcp_seasonality_anom" ,"prcpTempCorr_anom" , "aboveFreezingMonth_anom" , "isothermality_anom",
"annWatDef_anom" ,"annWetDegDays_anom" , "VPD_mean_anom" , "VPD_max_95_anom" ,
"frostFreeDays_5_anom" )
}
# subset the data to only include these predictors (plus the response and
# location/region identifier columns), and remove any remaining NAs.
# all_of() replaces the deprecated bare-character-vector selection and errors
# loudly if a name is missing.
modDat_1 <- modDat_1 %>%
  select(all_of(prednames), all_of(response), newRegion, Year, Long, Lat, NA_L1NAME, NA_L2NAME) %>%
  drop_na()
# named vector (names == values) so tidyselect/across keep the same names
names(prednames) <- prednames
df_pred <- modDat_1[, prednames]
#
# # print the list of predictor variables
# knitr::kable(format = "html", data.frame("Possible_Predictors" = prednames)
# ) %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
# Summarise every column of `df`: one output row per variable with its
# mean / min / median / max (NAs removed), rounded to 4 decimal places.
create_summary <- function(df) {
  df %>%
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>%
    group_by(variable) %>%
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
    # use a lambda: forwarding extra arguments through across(..., round, 4)
    # was deprecated in dplyr 1.1
    mutate(across(where(is.numeric), ~ round(.x, 4)))
}
# Tabulate mean/min/median/max for every candidate predictor.
predictor_summary <- create_summary(modDat_1[prednames])
predictor_summary %>%
  knitr::kable(caption = 'summaries of possible predictor variables') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| AWHC | 14.5037 | 0.9085 | 13.6556 | 34.6680 |
| VPD_max_95_anom | 0.0589 | -0.5726 | 0.0367 | 0.8326 |
| VPD_mean_anom | -0.0220 | -0.3148 | -0.0219 | 0.2651 |
| aboveFreezingMonth_anom | 0.1137 | -1.8939 | 0.0667 | 3.3667 |
| annWatDef | 34.6795 | 0.0000 | 24.7127 | 303.5579 |
| annWatDef_anom | -0.0566 | -8.9984 | -0.0757 | 1.0000 |
| annWetDegDays_anom | -0.0059 | -1.2398 | -0.0202 | 0.8050 |
| carbon | 7.9675 | 0.2171 | 4.6003 | 51.0604 |
| clay | 16.1179 | 0.0286 | 15.9399 | 83.0975 |
| coarse | 15.1012 | 0.0000 | 13.2884 | 64.1122 |
| frostFreeDays_5_anom | -25.7789 | -273.1000 | -30.0000 | 53.1000 |
| isothermality | 36.2563 | 21.4503 | 36.4876 | 59.8804 |
| isothermality_anom | 0.7314 | -8.4018 | 0.7281 | 11.7898 |
| prcp | 1112.2163 | 218.1991 | 1133.3410 | 4095.4507 |
| prcpTempCorr | -0.1548 | -0.8596 | -0.1202 | 0.7258 |
| prcpTempCorr_anom | -0.0139 | -0.5978 | -0.0017 | 0.6098 |
| prcp_anom | 0.0056 | -0.7548 | 0.0020 | 0.6720 |
| prcp_dry | 17.0445 | 0.0003 | 12.8220 | 76.7440 |
| prcp_seasonality_anom | -0.0048 | -0.6092 | 0.0030 | 0.4788 |
| prcp_wet_anom | 0.0047 | -1.4179 | 0.0141 | 0.6981 |
| precp_dry_anom | 0.0685 | -9.0000 | 0.0777 | 1.0000 |
| sand | 45.7041 | 0.0098 | 45.0641 | 99.9418 |
| tmax_anom | -0.2860 | -5.6017 | -0.2922 | 4.0197 |
| tmean | 9.4945 | -2.4480 | 8.3730 | 24.9713 |
| tmin_anom | -0.6298 | -5.7692 | -0.5744 | 2.6920 |
# response_summary <- modDat_1 %>%
# dplyr::select(#where(is.numeric), -all_of(pred_vars),
# matches(response)) %>%
# create_summary()
#
#
# kable(response_summary,
# caption = 'summaries of response variables, calculated using paint') %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
set.seed(12011993)
# Panel function for ggpairs: prints the pairwise correlation and colors the
# panel background by its value.
my_fn <- function(data, mapping, method="p", use="pairwise", ...){
# grab data
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
# calculate correlation
corr <- cor(x, y, method=method, use=use)
# calculate colour based on correlation value.
# The palette runs red -> white -> blue, so a correlation of minus one maps
# to red, zero to white, and one to blue.
# (An earlier comment here stated the opposite mapping.)
# Change this to suit: possibly extend to add as an argument of `my_fn`
colFn <- colorRampPalette(c("red", "white", "blue"), interpolate ='spline')
fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
ggally_cor(data = data, mapping = mapping, size = 2.5, stars = FALSE,
digits = 2, colour = I("black"),...) +
theme_void() +
theme(panel.background = element_rect(fill=fill))
}
# Pairwise correlation matrix of the predictors (expensive: drawn from a 50k
# random sample). Only rebuilt when `run` is TRUE; otherwise refer to a
# previously rendered version.
if (run == TRUE) {
  (corrPlot <- modDat_1 %>%
     # all_of() replaces the deprecated bare-vector selection
     select(all_of(prednames)) %>%
     slice_sample(n = 5e4) %>%
     #select(-matches("_")) %>%
     ggpairs( upper = list(continuous = my_fn, size = .1), lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.1)), progress = FALSE))
  base::saveRDS(corrPlot, paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
} else {
  # corrPlot <- readRDS(paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
  # (corrPlot)
  print(c("See previous correlation figures"))
}
set.seed(12011993)
# vector of name of response variables
vars_response <- response
# Long-format sample for boxplots: bin the response into quartile-style
# categories, then stack all predictors into predictor/value pairs.
# NOTE(review): the response was shifted by +2 earlier and cover appears to
# be on a 0-100 scale here, so the .25/.5/.75 breakpoints below likely put
# every row in the "1" bin -- confirm the intended response scale.
df_sample_plots <- modDat_1 %>%
  slice_sample(n = 5e4) %>%
  rename(response = all_of(response)) %>%
  mutate(response = case_when(
    response <= .25 ~ ".25",
    response > .25 & response <=.5 ~ ".5",
    response > .5 & response <=.75 ~ ".75",
    response >= .75 ~ "1",
  )) %>%
  select(c(response, all_of(prednames))) %>%
  tidyr::pivot_longer(cols = unname(prednames),
                      names_to = "predictor",
                      values_to = "value"
  )
# aes(.data$...) replaces the deprecated aes_string()
ggplot(df_sample_plots, aes(x = .data$response, y = .data$value)) +
  geom_boxplot() +
  facet_wrap(~predictor , scales = 'free_y') +
  ylab("Predictor Variable Values") +
  xlab(response)
# Z-score each predictor into a new "<name>_s" column. as.numeric() drops the
# one-column matrix that scale() returns, so the new columns are plain numeric
# vectors instead of matrix columns. .names supplies the "_s" suffix directly,
# which makes the original post-hoc names() reassignment unnecessary.
modDat_1_s <- modDat_1 %>%
  mutate(across(all_of(prednames), ~ as.numeric(base::scale(.x)), .names = "{.col}_s"))
# Long-format tables of unscaled and scaled predictor values, joined so the
# two distributions can be overlaid per predictor.
scaleFigDat_1 <- modDat_1_s %>%
  select(c(Long, Lat, Year, all_of(prednames))) %>%
  pivot_longer(cols = all_of(names(prednames)),
               names_to = "predNames",
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>%
  select(c(Long, Lat, Year, all_of(paste0(prednames, "_s")))) %>%
  # (a stray, empty `names_sep =` argument was removed from this call)
  pivot_longer(cols = all_of(paste0(prednames, "_s")),
               names_to = "predNames",
               values_to = "predValues_scaled") %>%
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))
# join explicitly on the shared key columns rather than letting left_join
# guess (and message about) them
scaleFigDat_3 <- scaleFigDat_1 %>%
  left_join(scaleFigDat_2, by = c("Long", "Lat", "Year", "predNames"))
ggplot(scaleFigDat_3) +
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") +
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") +
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")
## visualize the variation between groups across environmental space
## do a pca of climate across level 2 ecoregions
# Columns are already z-scored ("_s"), so prcomp defaults (center, no scale)
# are fine here.
pca <- prcomp(modDat_1_s[,paste0(prednames, "_s")])
library(factoextra) # attached here rather than at the top; only used for this plot
# PC score plot colored by level-2 ecoregion, with 95% ellipses; the outer
# parentheses force the ggplot object to print.
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
# For the shrub/grass ecoregion, lump small level-2 ecoregions into two
# combined regions. The same recoding is applied to the scaled and unscaled
# data frames through one helper (the original repeated four assignments).
if (ecoregion == "shrubGrass") {
  print("We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain,' 'Texas-Lousiana Coastal plain,' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)." )
  lumpL2Regions <- function(df) {
    df[df$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
    df[df$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "TEXAS-LOUISIANA COASTAL PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
    df
  }
  modDat_1_s <- lumpL2Regions(modDat_1_s)
  modDat_1 <- lumpL2Regions(modDat_1)
}
# make a table of n for each region
modDat_1 %>%
  dplyr::count(NA_L2NAME, name = "Number_Of_Observations") %>%
  rename("Level_2_Ecoregion" = NA_L2NAME) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Level_2_Ecoregion | Number_Of_Observations |
|---|---|
| ATLANTIC HIGHLANDS | 5686 |
| CENTRAL USA PLAINS | 1253 |
| EVERGLADES | 211 |
| MARINE WEST COAST FOREST | 7457 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 9726 |
| MIXED WOOD PLAINS | 8151 |
| MIXED WOOD SHIELD | 7960 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 15497 |
| SOUTHEASTERN USA PLAINS | 23498 |
| UPPER GILA MOUNTAINS | 9288 |
| WESTERN CORDILLERA | 68909 |
## make data into spatial format
# Convert the Long/Lat columns to sf point geometry. The CRS is supplied as a
# literal WKT string (a Lambert conformal conic over CONUS) rather than an
# EPSG code -- keep the string byte-identical when editing.
# NOTE(review): the columns are named Long/Lat but the declared CRS is a
# projected (metre) system; confirm the coordinates really are in this
# projection rather than geographic degrees.
modDat_1_sf <- modDat_1 %>%
st_as_sf(coords = c("Long", "Lat"), crs = st_crs("PROJCRS[\"unnamed\",\n    BASEGEOGCRS[\"unknown\",\n        DATUM[\"unknown\",\n            ELLIPSOID[\"Spheroid\",6378137,298.257223563,\n                LENGTHUNIT[\"metre\",1,\n                    ID[\"EPSG\",9001]]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433,\n                ID[\"EPSG\",9122]]]],\n    CONVERSION[\"Lambert Conic Conformal (2SP)\",\n        METHOD[\"Lambert Conic Conformal (2SP)\",\n            ID[\"EPSG\",9802]],\n        PARAMETER[\"Latitude of false origin\",42.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-100,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",25,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",60,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"easting\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]],\n        AXIS[\"northing\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]]]"))
# download map info for visualization
data(state_boundaries_wgs84)
# Keep only the conterminous states: one filter drops every non-CONUS state
# or territory (plus rows with a missing NAME, matching the original chain of
# != comparisons), then reproject to match the point data.
cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
dplyr::filter(!is.na(NAME) &
!NAME %in% c("Hawaii",
"Alaska",
"Puerto Rico",
"American Samoa",
"Guam",
"Commonwealth of the Northern Mariana Islands",
"United States Virgin Islands")) %>%
sf::st_sf() %>%
sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
#sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))
# Map of the modeling points over state boundaries, colored by level-2
# ecoregion (legend suppressed; there are too many regions).
map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf,aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  # NOTE(review): the original geom_point() passed `alpha` twice (0.5 and .1);
  # only one value can apply, so the duplicate was removed. Confirm 0.5
  # (matching the geom_sf layer) is the intended transparency.
  geom_point(data=modDat_1,alpha=0.5,
             aes(x = Long, y = Lat, color=as.factor(NA_L2NAME))) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
  # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")
# Convex hull of each level-2 ecoregion in tmean/prcp climate space
# (chull() returns the row indices of the hull vertices per group).
hull <- modDat_1_sf %>%
ungroup() %>%
group_by(NA_L2NAME) %>%
slice(chull(tmean, prcp))
# Scatter of sites in climate space with per-ecoregion hull polygons.
plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
theme_minimal() + xlab("Annual Average T_mean - long-term average") +
ylab("Annual Average Precip - long-term average") #+
#scale_color_okabeito() +
#scale_fill_okabeito()
# Density of tmean and prcp by ecoregion. cols=tmean:prcp selects the column
# *range* between those two names -- assumes they are adjacent in the data;
# verify if columns are ever reordered upstream.
plot2<-ggplot(data=modDat_1_sf %>%
pivot_longer(cols=tmean:prcp),
aes(x=value,group=name)) +
# geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
theme_minimal() +
facet_wrap(~name,scales='free')# +
#scale_color_okabeito() +
#scale_fill_okabeito()
library(patchwork) # already attached at the top of the file; harmless repeat
# Layout: map + climate-space scatter on top row, densities below.
(combo <- (map1+plot1)/plot2)
##
## Call: cv.glmnet(x = X[, 2:ncol(X)], y = y, type.measure = "mse", foldid = my_folds, keep = TRUE, parallel = TRUE, family = stats::Gamma(link = "log"), alpha = 1, nlambda = 100, standardize = FALSE)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.00732 46 1008 99.48 102
## 1se 0.05664 24 1102 100.53 19
Then, fit regular glm models (Gamma glm with a log link), first using the coefficients from the ‘best’ lambda identified in the LASSO model, and then using the coefficients from the ‘1SE’ lambda identified from the LASSO (this is the value of lambda such that the cross-validation error is within 1 standard error of the minimum).
# Build a glm formula from a glmnet coefficient matrix: keep the terms with
# non-zero coefficients, dropping the first kept entry (the intercept, which
# glm adds automatically). Returns an intercept-only formula when the
# intercept is the only non-zero coefficient. Replaces two copy-pasted blocks.
coef_to_formula <- function(coef_mat, response_name) {
  m <- as.matrix(coef_mat)
  kept <- rownames(m)[m[, 1] != 0]
  if (length(kept) == 1) {
    as.formula(paste0(response_name, "~ 1"))
  } else {
    as.formula(paste0(response_name, " ~ ", paste0(kept[2:length(kept)], collapse = " + ")))
  }
}
## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
f_glm_bestLambda <- coef_to_formula(bestLambda_coef, response)
fit_glm_bestLambda <- glm(data = modDat_1_s,
                          formula = f_glm_bestLambda, family = stats::Gamma(link = "log"))
## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
f_glm_1se <- coef_to_formula(seLambda_coef, response)
fit_glm_se <- glm(data = modDat_1_s, formula = f_glm_1se,
                  family = stats::Gamma(link = "log"))
Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)
## predict on the test data
# lasso model predictions with the optimal lambda
# NOTE(review): predict.glm() has no `newx` argument -- it is silently
# absorbed by `...`, so these two calls return fitted values for the data the
# glms were trained on (modDat_1_s). That matches the surrounding prose
# ("we predict (on the training set)"), but the `newx=` argument is
# misleading; confirm intent. The "- 2" undoes the +2 response shift.
optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response") - 2
optimal_pred_1se <- predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response") - 2
# Intercept-only Gamma glm on the (shifted) response as a null baseline.
null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]),
formula = y ~ 1, family = stats::Gamma(link = "log"))
null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
) - 2
# save data
# NOTE(review): `fit` is the cv.glmnet object produced in an earlier chunk
# (not shown here); confirm it is still in scope when this chunk runs.
fullModOut <- list(
"modelObject" = fit,
"nullModelObject" = null_fit,
"modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
obs=y,
pred_opt=optimal_pred,
pred_opt_se = optimal_pred_1se,
pred_null=null_pred#,
#pred_nopenalty=nopen_pred
))
# calculate correlations between null and optimal model
# (y - 2 shifts observations back to the original cover scale, matching the
# "- 2" already applied to the predictions above)
my_cors <- c(cor(optimal_pred, c(y-2)),
cor(optimal_pred_1se, c(y-2)),
cor(null_pred, c(y-2))
)
# calculate mse between null and optimal model
my_mse <- c(mean((fullModOut$modelPredictions$pred_opt - c(y-2))^2) ,
mean((fullModOut$modelPredictions$pred_opt_se - c(y-2))^2) ,
mean((fullModOut$modelPredictions$pred_null - c(y-2))^2)#,
#mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
)
# Observed vs. predicted values plotted against the first predictor column of
# X. Black = observed (shifted back to the original scale); red/green/blue =
# best-lambda / 1se-lambda / null-model predictions.
predDf <- fullModOut$modelPredictions
xVar <- X[,2]
ggplot() +
  geom_point(aes(xVar, predDf$obs-2), col = "black", alpha = .1) +
  geom_point(aes(xVar, predDf$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
  geom_point(aes(xVar, predDf$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
  geom_point(aes(xVar, predDf$pred_null), col = "blue", alpha = .1) +
  labs(title = "A rough comparison of observed and model-predicted values",
       subtitle = "black = observed values \n red = predictions from 'best lambda' model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
  xlab(colnames(X)[2])
#ylim(c(0,200))
The internal cross-validation process used to fit the global LASSO model
identified an optimal lambda value (regularization parameter) of
`r best_lambda`. The lambda value such that the cross-validation
error is within 1 standard error of the minimum ("1se lambda") was
`r fit$lambda.1se`. The following coefficients were kept in each model:
# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>%
  data.frame()
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model
coef_glm_1se <- coef(fit_glm_se) %>%
  data.frame()
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# add together: join explicitly on the term name rather than letting
# full_join guess (and message about) the join column
coefs <- full_join(coef_glm_bestLambda, coef_glm_1se, by = "coefficientName") %>%
  select(coefficientName, coefficientValue_bestLambda, coefficientValue_1seLambda)
globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]
## also, get the number of unique variables in each model
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# Split interaction terms on ":", strip the I( ... ^2) wrappers, and return
# the unique underlying variable names. Empty strings (padding produced by
# str_split(simplify = TRUE)) and the intercept are discarded explicitly.
# NOTE(review): the original filtered with a locale-dependent string
# comparison (`x > 0`); confirm the counts below match previous renders.
extract_unique_vars <- function(model_terms) {
  parts <- paste(str_split(model_terms, ":", simplify = TRUE))
  parts <- str_replace(parts, "I\\(", "")
  parts <- str_replace(parts, "\\^2\\)", "")
  unique(parts[nzchar(parts) & parts != "(Intercept)"])
}
# for best lambda model
prednames_fig <- extract_unique_vars(globModTerms)
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model (an intercept-only model counts as zero predictors)
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
prednames_fig_1se <- extract_unique_vars(globModTerms_1se)
prednames_fig_1se_num <- if (length(globModTerms_1se) == 1) 0 else length(prednames_fig_1se)
# Render the combined coefficient table.
coefs %>%
  kable(col.names = c("Coefficient Name",
                      "Value from best lambda model",
                      "Value from 1se lambda model")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Coefficient Name | Value from best lambda model | Value from 1se lambda model |
|---|---|---|
| (Intercept) | 3.9478279 | 3.8579796 |
| tmean_s | 0.2893448 | 0.1411519 |
| prcp_s | 0.2210049 | 0.1870352 |
| prcp_dry_s | 0.0499957 | 0.2220674 |
| annWatDef_s | -0.2546958 | NA |
| clay_s | -0.1206188 | -0.1312607 |
| sand_s | -0.0346888 | NA |
| carbon_s | 0.0347770 | NA |
| tmax_anom_s | -0.0398442 | NA |
| prcp_anom_s | -0.0361728 | -0.0688326 |
| prcp_seasonality_anom_s | 0.0493580 | 0.0475102 |
| prcpTempCorr_anom_s | -0.0197223 | NA |
| isothermality_anom_s | 0.0894365 | NA |
| VPD_mean_anom_s | -0.0358656 | NA |
| VPD_max_95_anom_s | 0.0647711 | NA |
| frostFreeDays_5_anom_s | 0.0192732 | NA |
| I(tmean_s^2) | -0.1387851 | NA |
| I(prcp_s^2) | -0.0153226 | -0.0506483 |
| I(isothermality_s^2) | -0.0340300 | NA |
| I(tmax_anom_s^2) | 0.0074491 | NA |
| I(precp_dry_anom_s^2) | 0.0017388 | 0.0019985 |
| I(prcpTempCorr_anom_s^2) | 0.0042046 | NA |
| I(aboveFreezingMonth_anom_s^2) | 0.0100035 | NA |
| I(annWatDef_anom_s^2) | 0.0044642 | 0.0026826 |
| I(annWetDegDays_anom_s^2) | -0.0023225 | -0.0097314 |
| I(VPD_mean_anom_s^2) | -0.0053953 | NA |
| I(frostFreeDays_5_anom_s^2) | -0.0030426 | -0.0189883 |
| I(clay_s^2) | 0.0093733 | -0.0063847 |
| I(carbon_s^2) | -0.0214525 | NA |
| I(AWHC_s^2) | -0.0133551 | NA |
| isothermality_anom_s:aboveFreezingMonth_anom_s | 0.0222594 | NA |
| isothermality_anom_s:annWatDef_anom_s | -0.0050785 | NA |
| annWatDef_anom_s:isothermality_s | -0.0400761 | NA |
| prcp_s:annWatDef_anom_s | 0.0187919 | NA |
| prcpTempCorr_anom_s:annWatDef_anom_s | -0.0090701 | NA |
| tmean_s:annWatDef_anom_s | 0.0476442 | NA |
| VPD_max_95_anom_s:annWatDef_anom_s | -0.0102281 | NA |
| annWatDef_s:annWetDegDays_anom_s | 0.0049810 | NA |
| annWatDef_s:frostFreeDays_5_anom_s | -0.0062716 | NA |
| prcp_s:annWatDef_s | 0.1222108 | 0.1576309 |
| annWatDef_s:prcp_seasonality_anom_s | 0.0178860 | NA |
| annWatDef_s:prcpTempCorr_s | -0.0429276 | NA |
| annWatDef_s:precp_dry_anom_s | 0.0002580 | NA |
| annWatDef_s:tmax_anom_s | -0.0303893 | NA |
| tmean_s:annWatDef_s | 0.1146563 | NA |
| annWatDef_s:VPD_mean_anom_s | 0.0076461 | NA |
| prcp_anom_s:annWetDegDays_anom_s | 0.0118080 | NA |
| prcp_s:annWetDegDays_anom_s | 0.0363850 | NA |
| annWetDegDays_anom_s:prcp_wet_anom_s | -0.0245392 | NA |
| annWetDegDays_anom_s:prcpTempCorr_s | 0.0147630 | NA |
| annWetDegDays_anom_s:tmin_anom_s | 0.0075356 | NA |
| VPD_max_95_anom_s:annWetDegDays_anom_s | 0.0112912 | NA |
| isothermality_anom_s:frostFreeDays_5_anom_s | 0.0167243 | NA |
| frostFreeDays_5_anom_s:isothermality_s | -0.0105411 | NA |
| prcp_seasonality_anom_s:frostFreeDays_5_anom_s | 0.0102692 | NA |
| frostFreeDays_5_anom_s:prcpTempCorr_s | -0.0321921 | NA |
| tmean_s:frostFreeDays_5_anom_s | -0.0049639 | NA |
| frostFreeDays_5_anom_s:tmin_anom_s | -0.0183672 | NA |
| VPD_max_95_anom_s:frostFreeDays_5_anom_s | -0.0133029 | NA |
| isothermality_anom_s:isothermality_s | -0.0463367 | NA |
| prcp_s:isothermality_anom_s | -0.0023462 | 0.0363701 |
| prcp_seasonality_anom_s:isothermality_anom_s | -0.0148473 | NA |
| isothermality_anom_s:prcpTempCorr_s | -0.1024205 | -0.0692058 |
| isothermality_anom_s:precp_dry_anom_s | 0.0262211 | NA |
| tmean_s:isothermality_anom_s | 0.0312082 | NA |
| prcp_anom_s:isothermality_s | 0.0341169 | NA |
| tmean_s:isothermality_s | 0.1046638 | NA |
| isothermality_s:tmin_anom_s | -0.0362730 | NA |
| prcp_anom_s:prcpTempCorr_s | 0.0383062 | NA |
| tmax_anom_s:prcp_anom_s | -0.0199885 | NA |
| prcp_anom_s:VPD_max_95_anom_s | 0.0152028 | NA |
| prcp_s:prcp_dry_s | -0.0106868 | NA |
| prcp_dry_s:prcp_seasonality_anom_s | -0.0438651 | NA |
| prcp_dry_s:prcpTempCorr_anom_s | 0.0216261 | NA |
| prcp_dry_s:VPD_mean_anom_s | 0.0093019 | NA |
| prcp_s:prcp_seasonality_anom_s | 0.0247148 | NA |
| prcp_s:prcp_wet_anom_s | 0.0359960 | NA |
| prcp_s:prcpTempCorr_s | 0.1328921 | NA |
| prcp_s:precp_dry_anom_s | -0.0183598 | NA |
| prcp_s:tmin_anom_s | 0.0120349 | NA |
| prcp_s:VPD_max_95_anom_s | -0.0653065 | NA |
| prcp_seasonality_anom_s:prcp_wet_anom_s | 0.0157170 | NA |
| prcp_seasonality_anom_s:prcpTempCorr_s | -0.0113767 | NA |
| tmax_anom_s:prcp_seasonality_anom_s | 0.0344748 | NA |
| prcp_seasonality_anom_s:tmin_anom_s | 0.0029254 | NA |
| prcp_seasonality_anom_s:VPD_max_95_anom_s | -0.0220358 | NA |
| precp_dry_anom_s:prcp_wet_anom_s | 0.0221938 | NA |
| prcpTempCorr_anom_s:prcpTempCorr_s | 0.0379639 | NA |
| prcpTempCorr_anom_s:precp_dry_anom_s | 0.0080300 | NA |
| tmean_s:prcpTempCorr_anom_s | 0.0085671 | NA |
| tmean_s:prcpTempCorr_s | -0.1178572 | -0.1583499 |
| prcpTempCorr_s:tmin_anom_s | -0.0432665 | NA |
| VPD_mean_anom_s:prcpTempCorr_s | 0.0611982 | NA |
| tmax_anom_s:precp_dry_anom_s | 0.0037978 | NA |
| precp_dry_anom_s:tmin_anom_s | 0.0243643 | NA |
| tmax_anom_s:VPD_mean_anom_s | -0.0171309 | NA |
| VPD_max_95_anom_s:tmin_anom_s | -0.0202670 | NA |
| carbon_s:AWHC_s | -0.0132248 | NA |
| clay_s:AWHC_s | -0.0325013 | NA |
| clay_s:carbon_s | -0.0909510 | -0.1102216 |
| carbon_s:coarse_s | 0.0433863 | NA |
| sand_s:carbon_s | 0.0252169 | NA |
| clay_s:coarse_s | 0.0307379 | NA |
| annWetDegDays_anom_s | NA | -0.0327555 |
| prcpTempCorr_s:isothermality_s | NA | -0.0093050 |
# calculate RMSE of both models
# NOTE(review): subtracting 2 from BOTH columns shifts obs and pred equally,
# so it cannot change the RMSE (rmse depends only on obs - pred). Also note
# pred_opt / pred_opt_se already had 2 subtracted when created, while obs
# (= y) appears to still be on the +2-shifted scale -- the ~1.5-1.6 "bias"
# below may largely be that offset; confirm.
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")]-2, truth = "obs", estimate = "pred_opt")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")]-2, truth = "obs", estimate = "pred_opt_se")$.estimate
bias_best <- mean(fullModOut$modelPredictions$obs - fullModOut$modelPredictions$pred_opt)
bias_1se <- mean(fullModOut$modelPredictions$obs - fullModOut$modelPredictions$pred_opt_se)
# Summary table: fit metrics plus coefficient/variable counts per model.
# NOTE(review): prednames_clim / prednames_weath / prednames_soils are
# defined in an earlier (unshown) chunk.
uniqueCoeffs <- data.frame("Best lambda model" = c(RMSE_best, bias_best,
as.integer(length(globModTerms)-1), as.integer(prednames_fig_num),
as.integer(sum(prednames_fig %in% c(prednames_clim))),
as.integer(sum(prednames_fig %in% c(prednames_weath))),
as.integer(sum(prednames_fig %in% c(prednames_soils)))
),
"1se lambda model" = c(RMSE_1se, bias_1se,
length(globModTerms_1se)-1, prednames_fig_1se_num,
sum(prednames_fig_1se %in% c(prednames_clim)),
sum(prednames_fig_1se %in% c(prednames_weath)),
sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias - mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
"Number of unique climate coefficients",
"Number of unique weather coefficients",
"Number of unique soils coefficients"
)
kable(uniqueCoeffs,
col.names = c("Best lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Best lambda model | 1se lambda model | |
|---|---|---|
| RMSE | 27.41215 | 28.556659 |
| bias - mean(obs-pred.) | 1.58696 | 1.545887 |
| Total number of coefficients | 102.00000 | 19.000000 |
| Number of unique coefficients | 25.00000 | 15.000000 |
| Number of unique climate coefficients | 6.00000 | 6.000000 |
| Number of unique weather coefficients | 14.00000 | 7.000000 |
| Number of unique soils coefficients | 5.00000 | 2.000000 |
# create prediction for each model
# (i.e. for each cover proportion variable)
# Append a "<response>_pred" column of model predictions to `df`.
# Uses the global `response` for the column name.
# NOTE(review): for glm objects predict() has no `newx` argument (it is
# absorbed by `...`), so the returned predictions are fitted values for the
# model's TRAINING data, not for `df` -- this only lines up because `df` is
# built from the same training rows. Confirm this is intended.
predict_by_response <- function(mod, df) {
df_out <- df
response_name <- paste0(response, "_pred")
df_out <- df_out %>% cbind(predict(mod, newx= df_out, #s="lambda.min",
type = "response"))
colnames(df_out)[ncol(df_out)] <- response_name
return(df_out)
}
# Build the prediction table for the best-lambda model: predictions, the
# observed response, coordinates/year, residuals, and an extreme-residual flag.
# X[,2:ncol(X)] drops the first column of the model matrix (presumably the
# intercept column — TODO confirm against where X is constructed).
pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])
# add back in true y values
pred_glm1 <- pred_glm1 %>%
cbind( data.frame("y" = y))
# rename the true response column to not be 'y_test'
colnames(pred_glm1)[which(colnames(pred_glm1) == "y")] <- paste(response)
# add back in lat/long data
pred_glm1 <- pred_glm1 %>%
cbind(modDat_1_s[,c("Long", "Lat", "Year")])
# residual = observed - predicted (positive = model under-predicts)
pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
# flag observations with |residual| > 70 (1 = extreme, NA otherwise)
# NOTE(review): if resid ever contains NA this logical-subscript assignment
# errors ("NAs are not allowed in subscripted assignments") — assumes resid is
# complete; verify upstream.
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 70 | pred_glm1$resid < -70,"extremeResid"] <- 1
# plot(x = pred_glm1[,response],
#      y = pred_glm1[,paste0(response, "_pred")],
#      xlab = "observed values", ylab = "predicted values")
# points(x = pred_glm1[!is.na(pred_glm1$extremeResid), response],
#        y = pred_glm1[!is.na(pred_glm1$extremeResid), paste0(response, "_pred")],
#        col = "red")
# Build the prediction table for the 1-SE-lambda model (mirrors the
# best-lambda table above): predictions, observed response, coordinates/year,
# residuals, and an extreme-residual flag.
pred_glm1_1se <- predict_by_response(fit_glm_se, X[, 2:ncol(X)])
# attach the observed response values
pred_glm1_1se <- cbind(pred_glm1_1se, data.frame("y" = y))
# give the observed-response column its proper name
names(pred_glm1_1se)[names(pred_glm1_1se) == "y"] <- response
# attach coordinates and sampling year
pred_glm1_1se <- cbind(pred_glm1_1se, modDat_1_s[, c("Long", "Lat", "Year")])
# residual = observed - predicted
pred_glm1_1se$resid <- pred_glm1_1se[[response]] - pred_glm1_1se[[paste0(response, "_pred")]]
# flag observations with |residual| > 70 (1 = extreme, NA otherwise)
pred_glm1_1se$extremeResid <- NA
pred_glm1_1se$extremeResid[abs(pred_glm1_1se$resid) > 70] <- 1
Observations across the temporal range of the dataset
# NOTE(review): resid was already computed identically when pred_glm1 was
# assembled above; this mutate() recomputes the same value (harmless but
# redundant).
pred_glm1 <- pred_glm1 %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize
# get reference raster (DayMet monthly precip, aggregated 8x to coarsen the
# grid); used only for its CRS/extent/resolution as a rasterization template
test_rast <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
terra::aggregate(fact = 8, fun = "mean")
## |---------|---------|---------|---------|=========================================
## add ecoregion boundaries (for our ecoregion level model)
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2")
## Reading layer `NA_CEC_Eco_Level2' from data source
## `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/PED_vegClimModels/Data_raw/Level2Ecoregions' using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
# reproject ecoregions to the raster CRS and repair any invalid geometries
regions <- regions %>%
st_transform(crs = st_crs(test_rast)) %>%
st_make_valid() #%>%
#st_crop(st_bbox(test_rast))
#
# goodRegions_temp <- st_overlaps(y = cropped_states, x = regions, sparse = FALSE) %>%
#  rowSums()
# goodRegions <- regions[goodRegions_temp != 0,]
# Lookup table collapsing Level-1 ecoregion names into two modeling regions
# ("Forest" vs "dryShrubGrass"; NA = excluded).
# NOTE(review): the newRegion values are matched positionally against
# sort(unique(...)) — this silently mis-assigns regions if the set of
# NA_L1NAME values ever changes; consider an explicit name->region mapping.
ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)),
                          "newRegion" = c(NA, "Forest", "dryShrubGrass",
                                          "dryShrubGrass", "Forest", "dryShrubGrass",
                                          "dryShrubGrass", "Forest", "Forest",
                                          "dryShrubGrass", "Forest", "Forest",
                                          "Forest", "Forest", "dryShrubGrass",
                                          NA
                          ))
# join relies on the shared NA_L1NAME column (left_join's default `by`)
goodRegions <- regions %>%
left_join(ecoregionLU)
# dissolve polygons within each modeling region and simplify for plotting
mapRegions <- goodRegions %>%
filter(!is.na(newRegion)) %>%
group_by(newRegion) %>%
summarise(geometry = sf::st_union(geometry)) %>%
ungroup() %>%
st_simplify(dTolerance = 1000)
#mapview(mapRegions)
# rasterize data: mean observed cover per template-raster cell
plotObs <- pred_glm1 %>%
drop_na(paste(response)) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = response,
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotObs, na.rm = TRUE)
plotObs_2 <- plotObs %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# make figures
# NOTE(review): `cropped_states` and `ecoregion` are defined elsewhere in the
# file. Also, cover values are 0-100 but the diverging scale's midpoint is 0,
# so only the wheat->darkgreen half of the palette is used — confirm intended.
ggplot() +
geom_spatraster(data = plotObs_2) +
geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "darkgreen" ,
midpoint = 0, na.value = "lightgrey") +
xlim(st_bbox(plotObs_2)[c(1,3)]) +
ylim(st_bbox(plotObs_2)[c(2,4)])
Predictions across the temporal range of the dataset
# rasterize data: mean best-lambda prediction per template-raster cell
plotPred <- pred_glm1 %>%
drop_na(paste0(response,"_pred")) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = paste0(response,"_pred"),
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the point location of those predictions that are > 100
# (or < 0, i.e. outside the valid percent-cover range)
highPred_points <- pred_glm1 %>%
filter(.[[paste0(response,"_pred")]] > 100 |
.[[paste0(response, "_pred")]] < 0) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)
plotPred_2 <- plotPred %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# make figures: predicted cover map with out-of-range points flagged in red;
# with limits = c(0,100) and midpoint = 100, the fill ramps wheat->darkgreen
# across the valid cover range (the "red" high end is never reached)
ggplot() +
geom_spatraster(data = plotPred_2) +
geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the 'best lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
subtitle = "bestLambda model") +
scale_fill_gradient2(low = "wheat",
mid = "darkgreen",
high = "red" ,
midpoint = 100, na.value = "lightgrey",
limits = c(0,100)) +
xlim(st_bbox(plotObs_2)[c(1,3)]) +
ylim(st_bbox(plotObs_2)[c(2,4)])
# rasterize data: mean 1-SE-lambda prediction per template-raster cell
# (same workflow as the best-lambda map above, using pred_glm1_1se)
plotPred <- pred_glm1_1se %>%
drop_na(paste0(response,"_pred")) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = paste0(response,"_pred"),
fun = mean) #%>%
#terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
#terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the point location of those predictions that are > 100
# (or < 0, i.e. outside the valid percent-cover range)
highPred_points <- pred_glm1_1se %>%
filter(.[[paste0(response,"_pred")]] > 100 |
.[[paste0(response, "_pred")]] < 0) %>%
terra::vect(geom = c("Long", "Lat")) %>%
terra::set.crs(crs(test_rast))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)
plotPred_2 <- plotPred %>%
crop(ext(min(tempExt[,1]), max(tempExt[,1]),
min(tempExt[,2]), max(tempExt[,2]))
)
# make figures: predicted cover map with out-of-range points flagged in red
ggplot() +
geom_spatraster(data = plotPred_2) +
geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) + geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the '1SE lambda' fitted model of ", response, " in the ",ecoregion, " ecoregion"),
subtitle = "1 SE Lambda model") +
scale_fill_gradient2(low = "wheat",
mid = "darkgreen",
high = "red" ,
midpoint = 100, na.value = "lightgrey",
limits = c(0,100)) +
xlim(st_bbox(plotObs_2)[c(1,3)]) +
ylim(st_bbox(plotObs_2)[c(2,4)])
Residuals across the entire temporal extent of the dataset
# Residual map + histogram for the best-lambda model.
# rasterize data: mean residual per template-raster cell
plotResid_rast <- pred_glm1 %>%
  drop_na(resid) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)
plotResid_rast_2 <- plotResid_rast %>%
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
  )
# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>%
  filter(resid > 100) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
badResids_low <- pred_glm1 %>%
  filter(resid < -100) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
# make figures
# FIX(review): the title previously hard-coded "Grass/shrub" even though this
# script is parameterized by ecoregion (here "forest") — use the `ecoregion`
# variable instead. Also fixed the "locatiosn" typo in the subtitle.
map <- ggplot() +
  geom_spatraster(data =plotResid_rast_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
  labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion, " ecoregion-wide model of ", response),
       subtitle = "bestLambda model \n red points indicate locations that have residuals below -100 \n blue points indicate locations that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0, na.value = "lightgrey",
                       limits = c(-100,100)
  ) +
  xlim(st_bbox(plotObs_2)[c(1,3)]) +
  ylim(st_bbox(plotObs_2)[c(2,4)])
# Residual histogram with min/max labels.
# FIX(review): geom_text(aes(...)) drew the identical label once per data row
# (thousands of overplotted copies); annotate() draws each label exactly once.
hist <- ggplot(pred_glm1) +
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
  annotate("text", x = min(pred_glm1$resid)*.9, y = 1500,
           label = paste0("min = ", round(min(pred_glm1$resid),2))) +
  annotate("text", x = max(pred_glm1$resid)*.9, y = 1500,
           label = paste0("max = ", round(max(pred_glm1$resid),2)))
library(ggpubr)
ggarrange(map, hist, heights = c(3,1), ncol = 1)
# Residual map + histogram for the 1-SE-lambda model (mirrors the
# best-lambda residual figure above).
# rasterize data: mean residual per template-raster cell
plotResid_rast <- pred_glm1_1se %>%
  drop_na(resid) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)
plotResid_rast_2 <- plotResid_rast %>%
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2]))
  )
# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1_1se %>%
  filter(resid > 100) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
badResids_low <- pred_glm1_1se %>%
  filter(resid < -100) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast))
# make figures
# FIX(review): the title previously hard-coded "Grass/shrub" even though this
# script is parameterized by ecoregion (here "forest") — use the `ecoregion`
# variable instead. Also fixed the "locatiosn" typo in the subtitle.
map <- ggplot() +
  geom_spatraster(data =plotResid_rast_2) +
  geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
  labs(title = paste0("Resids. (obs. - pred.) from the ", ecoregion, " ecoregion-wide model of ", response),
       subtitle = "1 SE Lambda model \n red points indicate locations that have residuals below -100 \n blue points indicate locations that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0, na.value = "lightgrey",
                       limits = c(-100,100)
  ) +
  xlim(st_bbox(plotObs_2)[c(1,3)]) +
  ylim(st_bbox(plotObs_2)[c(2,4)])
# Residual histogram with min/max labels.
# FIX(review): geom_text(aes(...)) drew the identical label once per data row
# (thousands of overplotted copies); annotate() draws each label exactly once.
hist <- ggplot(pred_glm1_1se) +
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") +
  annotate("text", x = min(pred_glm1_1se$resid)*.9, y = 1500,
           label = paste0("min = ", round(min(pred_glm1_1se$resid),2))) +
  annotate("text", x = max(pred_glm1_1se$resid)*.9, y = 1500,
           label = paste0("max = ", round(max(pred_glm1_1se$resid),2)))
ggarrange(map, hist, heights = c(3,1), ncol = 1)
### Are there biases of the model predictions across year/lat/long?
# Six panels: residuals vs. Year, Latitude, and Longitude, for both the
# best-lambda and 1-SE-lambda models, each with a LOESS smooth to reveal
# systematic bias. FIX(review): corrected the "lamba" typo in every ggtitle.
# plot residuals against Year (Year values jittered to reduce overplotting)
yearResidMod_bestLambda <- ggplot(pred_glm1) +
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) +
  geom_smooth(aes(x = Year, y = resid)) +
  xlab("Year") +
  ylab("Residual from best lambda model") +
  ggtitle("from best lambda model")
yearResidMod_1seLambda <- ggplot(pred_glm1_1se) +
  geom_point(aes(x = jitter(Year), y = resid), alpha = .1) +
  geom_smooth(aes(x = Year, y = resid)) +
  xlab("Year") +
  ylab("Residual from 1 SE lambda model")+
  ggtitle("from 1 SE lambda model")
# plot residuals against Lat
latResidMod_bestLambda <- ggplot(pred_glm1) +
  geom_point(aes(x = Lat, y = resid), alpha = .1) +
  geom_smooth(aes(x = Lat, y = resid)) +
  xlab("Latitude") +
  ylab("Residual from best lambda model") +
  ggtitle("from best lambda model")
latResidMod_1seLambda <- ggplot(pred_glm1_1se) +
  geom_point(aes(x = Lat, y = resid), alpha = .1) +
  geom_smooth(aes(x = Lat, y = resid)) +
  xlab("Latitude") +
  ylab("Residual from 1 SE lambda model") +
  ggtitle("from 1 SE lambda model")
# plot residuals against Long
longResidMod_bestLambda <- ggplot(pred_glm1) +
  geom_point(aes(x = Long, y = resid), alpha = .1) +
  geom_smooth(aes(x = Long, y = resid)) +
  xlab("Longitude") +
  ylab("Residual from best lambda model") +
  ggtitle("from best lambda model")
longResidMod_1seLambda <- ggplot(pred_glm1_1se) +
  geom_point(aes(x = Long, y = resid), alpha = .1) +
  geom_smooth(aes(x = Long, y = resid)) +
  xlab("Longitude") +
  ylab("Residual from 1 SE lambda model") +
  ggtitle("from 1 SE lambda model")
# patchwork was already attached at the top of the file; kept for chunk
# self-sufficiency
library(patchwork)
# arrange: rows = Year / Lat / Long, columns = best-lambda / 1-SE-lambda
(yearResidMod_bestLambda + yearResidMod_1seLambda) /
  ( latResidMod_bestLambda + latResidMod_1seLambda) /
  ( longResidMod_bestLambda + longResidMod_1seLambda)
Binning predictor variables into “Deciles” (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word “Deciles” is just a legacy naming choice (the bins started out as actual deciles and were later refined to percentiles).
# get deciles for best lambda model: bin each predictor into 0.5%-quantile
# bins (cut_points of 0.005 -> 200 "percentile" bins) via the project helper
# predvars2deciles(); skipped when the model retained no predictors
if (length(prednames_fig) == 0) {
print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
cut_points = seq(0, 1, 0.005))
}
## Processed 1743 groups out of 4788. 36% done. Time elapsed: 3s. ETA: 5s.Processed 2437 groups out of 4788. 51% done. Time elapsed: 4s. ETA: 3s.Processed 3007 groups out of 4788. 63% done. Time elapsed: 5s. ETA: 2s.Processed 4141 groups out of 4788. 86% done. Time elapsed: 6s. ETA: 0s.Processed 4219 groups out of 4788. 88% done. Time elapsed: 7s. ETA: 0s.Processed 4768 groups out of 4788. 100% done. Time elapsed: 8s. ETA: 0s.Processed 4788 groups out of 4788. 100% done. Time elapsed: 8s. ETA: 0s.
# get deciles for 1 SE lambda model: bin each predictor into 0.5%-quantile
# bins via the project helper predvars2deciles(); skipped when the model
# retained no predictors (intercept-only)
if (length(prednames_fig_1se) > 0) {
  pred_glm1_deciles_1se <- predvars2deciles(
    pred_glm1_1se,
    response_vars = response_vars,
    pred_vars = prednames_fig_1se,
    cut_points = seq(0, 1, 0.005)
  )
} else {
  print("The 1SE lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
}
## Processed 1916 groups out of 2839. 67% done. Time elapsed: 3s. ETA: 1s.Processed 2495 groups out of 2839. 88% done. Time elapsed: 4s. ETA: 0s.Processed 2839 groups out of 2839. 100% done. Time elapsed: 4s. ETA: 0s.
Here is a quick version of LOESS curves fit to raw data (to double-check the quantile plot calculations)
# Quick LOESS curves over the raw observed (dark blue/black) and predicted
# (light blue/brown) values for each retained predictor — a sanity check on
# the quantile-plot calculations below.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1 %>%
select(all_of(c(prednames_fig, response_vars))) %>%
# FIX(review): passing an external character vector straight to `cols` is
# deprecated tidyselect behavior; wrap it in all_of() (same columns selected)
pivot_longer(cols = all_of(prednames_fig)) %>%
ggplot() +
facet_wrap(~name, scales = "free") +
geom_point(aes(x = value, y = .data[[paste(response)]]), col = "darkblue", alpha = .1) + # observed values
geom_point(aes(x = value, y = .data[[response_vars[2]]]), col = "lightblue", alpha = .1) + # model-predicted values
geom_smooth(aes(x = value, y = .data[[paste(response)]]), col = "black", se = FALSE) +
geom_smooth(aes(x = value, y = .data[[response_vars[2]]]), col = "brown", se = FALSE)
}
Below are the actual quantile plots (note that the predictor variables are scaled)
# Publication-quality decile (quantile) plot for the best-lambda model,
# built from the binned data via the project helpers decile_dotplot_pq()
# and add_dotplot_inset(); optionally saved to disk when save_figs is TRUE
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response, IQR = TRUE) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, dfRaw = pred_glm1, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g4)
dev.off()
}
g4
}
# Publication-quality decile (quantile) plot for the 1-SE-lambda model.
# FIX(review): the saved filename was identical to the best-lambda figure's,
# so when save_figs is TRUE this chunk silently overwrote that file — a
# "_1se" suffix now keeps both figures.
if (length(prednames_fig_1se) == 0) {
print("The 1 se lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = response, IQR = TRUE) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, dfRaw = pred_glm1_1se, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,"_1se.png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g4)
dev.off()
}
g4
}
20th and 80th percentiles for each climate variable
# 20th and 80th percentiles of each figure predictor (named list, one
# two-element quantile vector per column)
df <- pred_glm1[, prednames_fig] #%>%
  #mutate(MAT = MAT - 273.15) # k to c
quantiles <- map(df, function(v) quantile(v, probs = c(0.2, 0.8), na.rm = TRUE))
Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each predictor variable.
# "Filtered" decile plots: rebin the data restricted to the upper and lower
# two deciles of each predictor (filter_var/filter_vars arguments of the
# project helper), then plot with decile_dotplot_filtered_pq()
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
filter_var = TRUE,
filter_vars = prednames_fig,
cut_points = seq(0, 1, 0.005))
decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = prednames_fig)
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)
}
## Processed 3326 groups out of 238300. 1% done. Time elapsed: 3s. ETA: 211s.Processed 4898 groups out of 238300. 2% done. Time elapsed: 4s. ETA: 190s.Processed 5895 groups out of 238300. 2% done. Time elapsed: 5s. ETA: 197s.Processed 7525 groups out of 238300. 3% done. Time elapsed: 6s. ETA: 184s.Processed 8357 groups out of 238300. 4% done. Time elapsed: 7s. ETA: 192s.Processed 9728 groups out of 238300. 4% done. Time elapsed: 8s. ETA: 188s.Processed 10308 groups out of 238300. 4% done. Time elapsed: 9s. ETA: 199s.Processed 12280 groups out of 238300. 5% done. Time elapsed: 10s. ETA: 184s.Processed 13310 groups out of 238300. 6% done. Time elapsed: 11s. ETA: 186s.Processed 14525 groups out of 238300. 6% done. Time elapsed: 12s. ETA: 185s.Processed 15420 groups out of 238300. 6% done. Time elapsed: 13s. ETA: 188s.Processed 16030 groups out of 238300. 7% done. Time elapsed: 14s. ETA: 194s.Processed 16937 groups out of 238300. 7% done. Time elapsed: 15s. ETA: 196s.Processed 18044 groups out of 238300. 8% done. Time elapsed: 16s. ETA: 196s.Processed 19273 groups out of 238300. 8% done. Time elapsed: 17s. ETA: 194s.Processed 20622 groups out of 238300. 9% done. Time elapsed: 18s. ETA: 192s.Processed 21858 groups out of 238300. 9% done. Time elapsed: 19s. ETA: 190s.Processed 23200 groups out of 238300. 10% done. Time elapsed: 20s. ETA: 191s.Processed 24740 groups out of 238300. 10% done. Time elapsed: 21s. ETA: 187s.Processed 25777 groups out of 238300. 11% done. Time elapsed: 22s. ETA: 187s.Processed 26914 groups out of 238300. 11% done. Time elapsed: 23s. ETA: 186s.Processed 27712 groups out of 238300. 12% done. Time elapsed: 24s. ETA: 188s.Processed 28357 groups out of 238300. 12% done. Time elapsed: 25s. ETA: 191s.Processed 29701 groups out of 238300. 12% done. Time elapsed: 27s. ETA: 189s.Processed 30447 groups out of 238300. 13% done. Time elapsed: 28s. ETA: 191s.Processed 30937 groups out of 238300. 13% done. Time elapsed: 29s. 
ETA: 196s.Processed 31933 groups out of 238300. 13% done. Time elapsed: 30s. ETA: 196s.Processed 33020 groups out of 238300. 14% done. Time elapsed: 31s. ETA: 195s.Processed 33564 groups out of 238300. 14% done. Time elapsed: 32s. ETA: 197s.Processed 34683 groups out of 238300. 15% done. Time elapsed: 33s. ETA: 195s.Processed 35787 groups out of 238300. 15% done. Time elapsed: 34s. ETA: 194s.Processed 36417 groups out of 238300. 15% done. Time elapsed: 35s. ETA: 196s.Processed 37468 groups out of 238300. 16% done. Time elapsed: 36s. ETA: 195s.Processed 38680 groups out of 238300. 16% done. Time elapsed: 37s. ETA: 194s.Processed 39611 groups out of 238300. 17% done. Time elapsed: 38s. ETA: 194s.Processed 40449 groups out of 238300. 17% done. Time elapsed: 39s. ETA: 194s.Processed 41258 groups out of 238300. 17% done. Time elapsed: 41s. ETA: 196s.Processed 42395 groups out of 238300. 18% done. Time elapsed: 42s. ETA: 195s.Processed 43575 groups out of 238300. 18% done. Time elapsed: 43s. ETA: 193s.Processed 43859 groups out of 238300. 18% done. Time elapsed: 44s. ETA: 196s.Processed 45020 groups out of 238300. 19% done. Time elapsed: 45s. ETA: 194s.Processed 46149 groups out of 238300. 19% done. Time elapsed: 46s. ETA: 192s.Processed 47089 groups out of 238300. 20% done. Time elapsed: 47s. ETA: 191s.Processed 48530 groups out of 238300. 20% done. Time elapsed: 48s. ETA: 188s.Processed 49522 groups out of 238300. 21% done. Time elapsed: 49s. ETA: 187s.Processed 50702 groups out of 238300. 21% done. Time elapsed: 50s. ETA: 185s.Processed 51569 groups out of 238300. 22% done. Time elapsed: 51s. ETA: 185s.Processed 53178 groups out of 238300. 22% done. Time elapsed: 52s. ETA: 181s.Processed 54148 groups out of 238300. 23% done. Time elapsed: 53s. ETA: 181s.Processed 55933 groups out of 238300. 23% done. Time elapsed: 54s. ETA: 177s.Processed 56726 groups out of 238300. 24% done. Time elapsed: 58s. ETA: 187s.Processed 57486 groups out of 238300. 24% done. 
Time elapsed: 59s. ETA: 187s.Processed 59229 groups out of 238300. 25% done. Time elapsed: 60s. ETA: 183s.Processed 60081 groups out of 238300. 25% done. Time elapsed: 61s. ETA: 182s.Processed 60846 groups out of 238300. 26% done. Time elapsed: 62s. ETA: 182s.Processed 61726 groups out of 238300. 26% done. Time elapsed: 63s. ETA: 181s.Processed 62334 groups out of 238300. 26% done. Time elapsed: 64s. ETA: 182s.Processed 63903 groups out of 238300. 27% done. Time elapsed: 65s. ETA: 178s.Processed 64948 groups out of 238300. 27% done. Time elapsed: 66s. ETA: 177s.Processed 66675 groups out of 238300. 28% done. Time elapsed: 67s. ETA: 173s.Processed 67484 groups out of 238300. 28% done. Time elapsed: 68s. ETA: 173s.Processed 68816 groups out of 238300. 29% done. Time elapsed: 69s. ETA: 171s.Processed 69798 groups out of 238300. 29% done. Time elapsed: 70s. ETA: 170s.Processed 71412 groups out of 238300. 30% done. Time elapsed: 71s. ETA: 167s.Processed 72275 groups out of 238300. 30% done. Time elapsed: 72s. ETA: 166s.Processed 73763 groups out of 238300. 31% done. Time elapsed: 73s. ETA: 164s.Processed 74614 groups out of 238300. 31% done. Time elapsed: 74s. ETA: 163s.Processed 75828 groups out of 238300. 32% done. Time elapsed: 75s. ETA: 161s.Processed 77371 groups out of 238300. 32% done. Time elapsed: 76s. ETA: 159s.Processed 78993 groups out of 238300. 33% done. Time elapsed: 77s. ETA: 157s.Processed 80133 groups out of 238300. 34% done. Time elapsed: 78s. ETA: 155s.Processed 81450 groups out of 238300. 34% done. Time elapsed: 79s. ETA: 153s.Processed 82536 groups out of 238300. 35% done. Time elapsed: 80s. ETA: 152s.Processed 83800 groups out of 238300. 35% done. Time elapsed: 81s. ETA: 151s.Processed 85118 groups out of 238300. 36% done. Time elapsed: 83s. ETA: 149s.Processed 86631 groups out of 238300. 36% done. Time elapsed: 84s. ETA: 147s.Processed 87719 groups out of 238300. 37% done. Time elapsed: 85s. ETA: 146s.Processed 89544 groups out of 238300. 
38% done. Time elapsed: 86s. ETA: 143s.Processed 90281 groups out of 238300. 38% done. Time elapsed: 87s. ETA: 142s.Processed 91415 groups out of 238300. 38% done. Time elapsed: 88s. ETA: 141s.Processed 92333 groups out of 238300. 39% done. Time elapsed: 89s. ETA: 140s.Processed 93387 groups out of 238300. 39% done. Time elapsed: 90s. ETA: 139s.Processed 94997 groups out of 238300. 40% done. Time elapsed: 91s. ETA: 137s.Processed 96015 groups out of 238300. 40% done. Time elapsed: 92s. ETA: 136s.Processed 96920 groups out of 238300. 41% done. Time elapsed: 93s. ETA: 135s.Processed 97783 groups out of 238300. 41% done. Time elapsed: 94s. ETA: 135s.Processed 98280 groups out of 238300. 41% done. Time elapsed: 95s. ETA: 135s.Processed 99236 groups out of 238300. 42% done. Time elapsed: 96s. ETA: 134s.Processed 100479 groups out of 238300. 42% done. Time elapsed: 97s. ETA: 133s.Processed 101685 groups out of 238300. 43% done. Time elapsed: 98s. ETA: 131s.Processed 103185 groups out of 238300. 43% done. Time elapsed: 99s. ETA: 130s.Processed 104820 groups out of 238300. 44% done. Time elapsed: 100s. ETA: 127s.Processed 105902 groups out of 238300. 44% done. Time elapsed: 101s. ETA: 126s.Processed 106971 groups out of 238300. 45% done. Time elapsed: 102s. ETA: 125s.Processed 107408 groups out of 238300. 45% done. Time elapsed: 103s. ETA: 126s.Processed 108175 groups out of 238300. 45% done. Time elapsed: 104s. ETA: 125s.Processed 109345 groups out of 238300. 46% done. Time elapsed: 105s. ETA: 124s.Processed 110812 groups out of 238300. 47% done. Time elapsed: 106s. ETA: 122s.Processed 111896 groups out of 238300. 47% done. Time elapsed: 107s. ETA: 121s.Processed 113472 groups out of 238300. 48% done. Time elapsed: 108s. ETA: 119s.Processed 114697 groups out of 238300. 48% done. Time elapsed: 109s. ETA: 117s.Processed 115857 groups out of 238300. 49% done. Time elapsed: 110s. ETA: 116s.Processed 116999 groups out of 238300. 49% done. Time elapsed: 111s. 
ETA: 115s.Processed 118490 groups out of 238300. 50% done. Time elapsed: 112s. ETA: 113s.Processed 119419 groups out of 238300. 50% done. Time elapsed: 113s. ETA: 112s.Processed 120790 groups out of 238300. 51% done. Time elapsed: 114s. ETA: 111s.Processed 121980 groups out of 238300. 51% done. Time elapsed: 115s. ETA: 110s.Processed 123712 groups out of 238300. 52% done. Time elapsed: 116s. ETA: 107s.Processed 124745 groups out of 238300. 52% done. Time elapsed: 117s. ETA: 106s.Processed 125866 groups out of 238300. 53% done. Time elapsed: 118s. ETA: 105s.Processed 126826 groups out of 238300. 53% done. Time elapsed: 119s. ETA: 105s.Processed 128167 groups out of 238300. 54% done. Time elapsed: 120s. ETA: 103s.Processed 129377 groups out of 238300. 54% done. Time elapsed: 121s. ETA: 102s.Processed 131066 groups out of 238300. 55% done. Time elapsed: 122s. ETA: 100s.Processed 132200 groups out of 238300. 55% done. Time elapsed: 123s. ETA: 99s.Processed 133274 groups out of 238300. 56% done. Time elapsed: 124s. ETA: 98s.Processed 134357 groups out of 238300. 56% done. Time elapsed: 125s. ETA: 97s.Processed 135137 groups out of 238300. 57% done. Time elapsed: 126s. ETA: 96s.Processed 135859 groups out of 238300. 57% done. Time elapsed: 127s. ETA: 96s.Processed 136684 groups out of 238300. 57% done. Time elapsed: 128s. ETA: 95s.Processed 137327 groups out of 238300. 58% done. Time elapsed: 129s. ETA: 95s.Processed 138392 groups out of 238300. 58% done. Time elapsed: 130s. ETA: 94s.Processed 139290 groups out of 238300. 58% done. Time elapsed: 131s. ETA: 93s.Processed 140456 groups out of 238300. 59% done. Time elapsed: 132s. ETA: 92s.Processed 141334 groups out of 238300. 59% done. Time elapsed: 133s. ETA: 91s.Processed 141868 groups out of 238300. 60% done. Time elapsed: 134s. ETA: 91s.Processed 143723 groups out of 238300. 60% done. Time elapsed: 135s. ETA: 89s.Processed 144858 groups out of 238300. 61% done. Time elapsed: 136s. 
ETA: 88s.Processed 146557 groups out of 238300. 62% done. Time elapsed: 137s. ETA: 86s.Processed 147268 groups out of 238300. 62% done. Time elapsed: 138s. ETA: 85s.Processed 148506 groups out of 238300. 62% done. Time elapsed: 139s. ETA: 84s.Processed 149605 groups out of 238300. 63% done. Time elapsed: 141s. ETA: 83s.Processed 151253 groups out of 238300. 63% done. Time elapsed: 142s. ETA: 81s.Processed 152182 groups out of 238300. 64% done. Time elapsed: 143s. ETA: 80s.Processed 153900 groups out of 238300. 65% done. Time elapsed: 144s. ETA: 78s.Processed 154832 groups out of 238300. 65% done. Time elapsed: 145s. ETA: 78s.Processed 156549 groups out of 238300. 66% done. Time elapsed: 146s. ETA: 76s.Processed 157440 groups out of 238300. 66% done. Time elapsed: 147s. ETA: 75s.Processed 158564 groups out of 238300. 67% done. Time elapsed: 148s. ETA: 74s.Processed 159918 groups out of 238300. 67% done. Time elapsed: 149s. ETA: 73s.Processed 161626 groups out of 238300. 68% done. Time elapsed: 150s. ETA: 71s.Processed 162849 groups out of 238300. 68% done. Time elapsed: 151s. ETA: 70s.Processed 164295 groups out of 238300. 69% done. Time elapsed: 152s. ETA: 68s.Processed 165359 groups out of 238300. 69% done. Time elapsed: 153s. ETA: 67s.Processed 166430 groups out of 238300. 70% done. Time elapsed: 154s. ETA: 66s.Processed 167313 groups out of 238300. 70% done. Time elapsed: 155s. ETA: 65s.Processed 168285 groups out of 238300. 71% done. Time elapsed: 156s. ETA: 65s.Processed 169271 groups out of 238300. 71% done. Time elapsed: 157s. ETA: 64s.Processed 170278 groups out of 238300. 71% done. Time elapsed: 158s. ETA: 63s.Processed 171703 groups out of 238300. 72% done. Time elapsed: 159s. ETA: 61s.Processed 172809 groups out of 238300. 73% done. Time elapsed: 160s. ETA: 60s.Processed 174726 groups out of 238300. 73% done. Time elapsed: 161s. ETA: 58s.Processed 175750 groups out of 238300. 74% done. Time elapsed: 162s. ETA: 57s.Processed 177406 groups out of 238300. 
74% done. Time elapsed: 163s. ETA: 56s.Processed 177968 groups out of 238300. 75% done. Time elapsed: 164s. ETA: 55s.Processed 179662 groups out of 238300. 75% done. Time elapsed: 165s. ETA: 54s.Processed 180912 groups out of 238300. 76% done. Time elapsed: 166s. ETA: 52s.Processed 182425 groups out of 238300. 77% done. Time elapsed: 167s. ETA: 51s.Processed 183460 groups out of 238300. 77% done. Time elapsed: 168s. ETA: 50s.Processed 184897 groups out of 238300. 78% done. Time elapsed: 170s. ETA: 49s.Processed 185705 groups out of 238300. 78% done. Time elapsed: 171s. ETA: 48s.Processed 187468 groups out of 238300. 79% done. Time elapsed: 172s. ETA: 46s.Processed 188603 groups out of 238300. 79% done. Time elapsed: 173s. ETA: 45s.Processed 190260 groups out of 238300. 80% done. Time elapsed: 174s. ETA: 43s.Processed 190864 groups out of 238300. 80% done. Time elapsed: 175s. ETA: 43s.Processed 192019 groups out of 238300. 81% done. Time elapsed: 176s. ETA: 42s.Processed 192939 groups out of 238300. 81% done. Time elapsed: 177s. ETA: 41s.Processed 193680 groups out of 238300. 81% done. Time elapsed: 178s. ETA: 41s.Processed 194576 groups out of 238300. 82% done. Time elapsed: 179s. ETA: 40s.Processed 195681 groups out of 238300. 82% done. Time elapsed: 180s. ETA: 39s.Processed 196605 groups out of 238300. 83% done. Time elapsed: 181s. ETA: 38s.Processed 197541 groups out of 238300. 83% done. Time elapsed: 182s. ETA: 37s.Processed 198601 groups out of 238300. 83% done. Time elapsed: 183s. ETA: 36s.Processed 200201 groups out of 238300. 84% done. Time elapsed: 184s. ETA: 35s.Processed 201204 groups out of 238300. 84% done. Time elapsed: 185s. ETA: 34s.Processed 202776 groups out of 238300. 85% done. Time elapsed: 186s. ETA: 32s.Processed 203765 groups out of 238300. 86% done. Time elapsed: 187s. ETA: 31s.Processed 205439 groups out of 238300. 86% done. Time elapsed: 188s. ETA: 30s.Processed 206701 groups out of 238300. 87% done. Time elapsed: 189s. 
ETA: 28s.Processed 208192 groups out of 238300. 87% done. Time elapsed: 190s. ETA: 27s.Processed 209394 groups out of 238300. 88% done. Time elapsed: 191s. ETA: 26s.Processed 210974 groups out of 238300. 89% done. Time elapsed: 192s. ETA: 24s.Processed 212156 groups out of 238300. 89% done. Time elapsed: 193s. ETA: 23s.Processed 212896 groups out of 238300. 89% done. Time elapsed: 194s. ETA: 23s.Processed 214138 groups out of 238300. 90% done. Time elapsed: 195s. ETA: 22s.Processed 215809 groups out of 238300. 91% done. Time elapsed: 196s. ETA: 20s.Processed 216803 groups out of 238300. 91% done. Time elapsed: 197s. ETA: 19s.Processed 218670 groups out of 238300. 92% done. Time elapsed: 198s. ETA: 17s.Processed 219398 groups out of 238300. 92% done. Time elapsed: 199s. ETA: 17s.Processed 220448 groups out of 238300. 93% done. Time elapsed: 200s. ETA: 16s.Processed 221820 groups out of 238300. 93% done. Time elapsed: 201s. ETA: 14s.Processed 223513 groups out of 238300. 94% done. Time elapsed: 202s. ETA: 13s.Processed 224688 groups out of 238300. 94% done. Time elapsed: 203s. ETA: 12s.Processed 226126 groups out of 238300. 95% done. Time elapsed: 204s. ETA: 11s.Processed 226978 groups out of 238300. 95% done. Time elapsed: 206s. ETA: 10s.Processed 227680 groups out of 238300. 96% done. Time elapsed: 207s. ETA: 9s.Processed 228390 groups out of 238300. 96% done. Time elapsed: 208s. ETA: 9s.Processed 228881 groups out of 238300. 96% done. Time elapsed: 209s. ETA: 8s.Processed 229557 groups out of 238300. 96% done. Time elapsed: 210s. ETA: 8s.Processed 231093 groups out of 238300. 97% done. Time elapsed: 211s. ETA: 6s.Processed 232136 groups out of 238300. 97% done. Time elapsed: 212s. ETA: 5s.Processed 234000 groups out of 238300. 98% done. Time elapsed: 213s. ETA: 3s.Processed 235140 groups out of 238300. 99% done. Time elapsed: 214s. ETA: 2s.Processed 236324 groups out of 238300. 99% done. Time elapsed: 215s. ETA: 1s.Processed 237295 groups out of 238300. 100% done. 
Time elapsed: 216s. ETA: 0s.Processed 238300 groups out of 238300. 100% done. Time elapsed: 217s. ETA: 0s.
Filtered quantile figure with middle 2 deciles also shown
# Filtered decile plot with the middle two deciles also shown (add_mid = TRUE).
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = prednames_fig,
filter_vars = prednames_fig,
filter_var = TRUE,
add_mid = TRUE,
cut_points = seq(0, 1, 0.005))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = prednames_fig)
# fixed: stray `x` token after `{` was a runtime error whenever save_figs was TRUE
if(save_figs) {
# fixed: paste0() had an empty argument (a syntax-level error); the filename now
# includes response and ecoregion, matching the other figure chunks
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1_", response, "_", ecoregion, ".jpeg"),
units = "in", res = 600, width = 5.5, height = 6 )
# fixed: a bare `g` inside the device block does not auto-print, which left the
# jpeg blank; print() is required for ggplot objects inside a graphics device
print(g)
dev.off()
}
# fixed: moved the on-screen display after the save block so the plot is the
# chunk's final (visible) value; previously it never rendered
g
}
## Processed 2816 groups out of 357830. 1% done. Time elapsed: 3s. ETA: 378s.Processed 4101 groups out of 357830. 1% done. Time elapsed: 5s. ETA: 442s.Processed 5569 groups out of 357830. 2% done. Time elapsed: 6s. ETA: 387s.Processed 6455 groups out of 357830. 2% done. Time elapsed: 11s. ETA: 622s.Processed 7864 groups out of 357830. 2% done. Time elapsed: 12s. ETA: 553s.Processed 8815 groups out of 357830. 2% done. Time elapsed: 13s. ETA: 534s.Processed 10548 groups out of 357830. 3% done. Time elapsed: 14s. ETA: 477s.Processed 11336 groups out of 357830. 3% done. Time elapsed: 15s. ETA: 473s.Processed 12981 groups out of 357830. 4% done. Time elapsed: 16s. ETA: 438s.Processed 13534 groups out of 357830. 4% done. Time elapsed: 17s. ETA: 454s.Processed 14447 groups out of 357830. 4% done. Time elapsed: 18s. ETA: 448s.Processed 15492 groups out of 357830. 4% done. Time elapsed: 19s. ETA: 438s.Processed 15980 groups out of 357830. 4% done. Time elapsed: 20s. ETA: 446s.Processed 16900 groups out of 357830. 5% done. Time elapsed: 21s. ETA: 441s.Processed 17914 groups out of 357830. 5% done. Time elapsed: 22s. ETA: 433s.Processed 18427 groups out of 357830. 5% done. Time elapsed: 23s. ETA: 439s.Processed 19452 groups out of 357830. 5% done. Time elapsed: 24s. ETA: 432s.Processed 20254 groups out of 357830. 6% done. Time elapsed: 25s. ETA: 431s.Processed 21057 groups out of 357830. 6% done. Time elapsed: 26s. ETA: 429s.Processed 22442 groups out of 357830. 6% done. Time elapsed: 27s. ETA: 416s.Processed 23512 groups out of 357830. 7% done. Time elapsed: 28s. ETA: 410s.Processed 24936 groups out of 357830. 7% done. Time elapsed: 29s. ETA: 398s.Processed 25968 groups out of 357830. 7% done. Time elapsed: 30s. ETA: 394s.Processed 27443 groups out of 357830. 8% done. Time elapsed: 31s. ETA: 383s.Processed 28508 groups out of 357830. 8% done. Time elapsed: 32s. ETA: 379s.Processed 29953 groups out of 357830. 8% done. Time elapsed: 33s. 
ETA: 370s.Processed 30778 groups out of 357830. 9% done. Time elapsed: 34s. ETA: 370s.Processed 32209 groups out of 357830. 9% done. Time elapsed: 35s. ETA: 362s.Processed 32992 groups out of 357830. 9% done. Time elapsed: 36s. ETA: 362s.Processed 34062 groups out of 357830. 10% done. Time elapsed: 37s. ETA: 359s.Processed 34727 groups out of 357830. 10% done. Time elapsed: 38s. ETA: 361s.Processed 34811 groups out of 357830. 10% done. Time elapsed: 39s. ETA: 369s.Processed 35326 groups out of 357830. 10% done. Time elapsed: 40s. ETA: 373s.Processed 36250 groups out of 357830. 10% done. Time elapsed: 41s. ETA: 371s.Processed 37103 groups out of 357830. 10% done. Time elapsed: 43s. ETA: 374s.Processed 38377 groups out of 357830. 11% done. Time elapsed: 44s. ETA: 368s.Processed 39460 groups out of 357830. 11% done. Time elapsed: 45s. ETA: 368s.Processed 40584 groups out of 357830. 11% done. Time elapsed: 46s. ETA: 365s.Processed 41819 groups out of 357830. 12% done. Time elapsed: 47s. ETA: 361s.Processed 42561 groups out of 357830. 12% done. Time elapsed: 48s. ETA: 361s.Processed 43384 groups out of 357830. 12% done. Time elapsed: 49s. ETA: 360s.Processed 44176 groups out of 357830. 12% done. Time elapsed: 51s. ETA: 362s.Processed 45570 groups out of 357830. 13% done. Time elapsed: 52s. ETA: 356s.Processed 46317 groups out of 357830. 13% done. Time elapsed: 53s. ETA: 356s.Processed 46975 groups out of 357830. 13% done. Time elapsed: 54s. ETA: 357s.Processed 47784 groups out of 357830. 13% done. Time elapsed: 55s. ETA: 357s.Processed 48654 groups out of 357830. 14% done. Time elapsed: 56s. ETA: 356s.Processed 49332 groups out of 357830. 14% done. Time elapsed: 57s. ETA: 356s.Processed 50399 groups out of 357830. 14% done. Time elapsed: 58s. ETA: 354s.Processed 51253 groups out of 357830. 14% done. Time elapsed: 59s. ETA: 354s.Processed 52317 groups out of 357830. 15% done. Time elapsed: 60s. ETA: 352s.Processed 53229 groups out of 357830. 15% done. Time elapsed: 61s. 
ETA: 350s.Processed 53819 groups out of 357830. 15% done. Time elapsed: 62s. ETA: 351s.Processed 54850 groups out of 357830. 15% done. Time elapsed: 63s. ETA: 349s.Processed 55872 groups out of 357830. 16% done. Time elapsed: 64s. ETA: 347s.Processed 56513 groups out of 357830. 16% done. Time elapsed: 65s. ETA: 348s.Processed 57676 groups out of 357830. 16% done. Time elapsed: 66s. ETA: 344s.Processed 58328 groups out of 357830. 16% done. Time elapsed: 67s. ETA: 345s.Processed 59518 groups out of 357830. 17% done. Time elapsed: 68s. ETA: 342s.Processed 60451 groups out of 357830. 17% done. Time elapsed: 69s. ETA: 340s.Processed 61207 groups out of 357830. 17% done. Time elapsed: 70s. ETA: 340s.Processed 62176 groups out of 357830. 17% done. Time elapsed: 71s. ETA: 339s.Processed 63041 groups out of 357830. 18% done. Time elapsed: 72s. ETA: 339s.Processed 64145 groups out of 357830. 18% done. Time elapsed: 73s. ETA: 336s.Processed 65202 groups out of 357830. 18% done. Time elapsed: 74s. ETA: 334s.Processed 66027 groups out of 357830. 18% done. Time elapsed: 75s. ETA: 333s.Processed 67000 groups out of 357830. 19% done. Time elapsed: 76s. ETA: 332s.Processed 67756 groups out of 357830. 19% done. Time elapsed: 77s. ETA: 332s.Processed 68888 groups out of 357830. 19% done. Time elapsed: 78s. ETA: 330s.Processed 70002 groups out of 357830. 20% done. Time elapsed: 79s. ETA: 327s.Processed 70886 groups out of 357830. 20% done. Time elapsed: 80s. ETA: 326s.Processed 71963 groups out of 357830. 20% done. Time elapsed: 81s. ETA: 324s.Processed 72551 groups out of 357830. 20% done. Time elapsed: 82s. ETA: 325s.Processed 73527 groups out of 357830. 21% done. Time elapsed: 83s. ETA: 324s.Processed 74216 groups out of 357830. 21% done. Time elapsed: 85s. ETA: 324s.Processed 74823 groups out of 357830. 21% done. Time elapsed: 86s. ETA: 327s.Processed 76196 groups out of 357830. 21% done. Time elapsed: 87s. ETA: 323s.Processed 77179 groups out of 357830. 22% done. 
Time elapsed: 88s. ETA: 322s.Processed 79002 groups out of 357830. 22% done. Time elapsed: 89s. ETA: 316s.Processed 81046 groups out of 357830. 23% done. Time elapsed: 90s. ETA: 310s.Processed 82702 groups out of 357830. 23% done. Time elapsed: 91s. ETA: 305s.Processed 84427 groups out of 357830. 24% done. Time elapsed: 92s. ETA: 300s.Processed 86604 groups out of 357830. 24% done. Time elapsed: 93s. ETA: 293s.Processed 88961 groups out of 357830. 25% done. Time elapsed: 95s. ETA: 287s.Processed 89153 groups out of 357830. 25% done. Time elapsed: 134164s. ETA: 404325s.Processed 89377 groups out of 357830. 25% done. Time elapsed: 134165s. ETA: 402979s.Processed 90441 groups out of 357830. 25% done. Time elapsed: 134166s. ETA: 396662s.Processed 93347 groups out of 357830. 26% done. Time elapsed: 134167s. ETA: 380140s.Processed 97623 groups out of 357830. 27% done. Time elapsed: 134168s. ETA: 357615s.Processed 101975 groups out of 357830. 28% done. Time elapsed: 134169s. ETA: 336630s.Processed 106416 groups out of 357830. 30% done. Time elapsed: 134170s. ETA: 316985s.Processed 110877 groups out of 357830. 31% done. Time elapsed: 134171s. ETA: 298835s.Processed 115371 groups out of 357830. 32% done. Time elapsed: 134172s. ETA: 281970s.Processed 119956 groups out of 357830. 34% done. Time elapsed: 134173s. ETA: 266066s.Processed 124536 groups out of 357830. 35% done. Time elapsed: 134174s. ETA: 251349s.Processed 129150 groups out of 357830. 36% done. Time elapsed: 134175s. ETA: 237578s.Processed 133769 groups out of 357830. 37% done. Time elapsed: 134176s. ETA: 224743s.Processed 138488 groups out of 357830. 39% done. Time elapsed: 134177s. ETA: 212514s.Processed 143206 groups out of 357830. 40% done. Time elapsed: 134178s. ETA: 201094s.Processed 147322 groups out of 357830. 41% done. Time elapsed: 134179s. ETA: 191728s.Processed 149113 groups out of 357830. 42% done. Time elapsed: 134180s. ETA: 187815s.Processed 153051 groups out of 357830. 43% done. 
Time elapsed: 134181s. ETA: 179531s.Processed 157655 groups out of 357830. 44% done. Time elapsed: 134182s. ETA: 170371s.Processed 162067 groups out of 357830. 45% done. Time elapsed: 134183s. ETA: 162082s.Processed 166779 groups out of 357830. 47% done. Time elapsed: 134184s. ETA: 153712s.Processed 171490 groups out of 357830. 48% done. Time elapsed: 134185s. ETA: 145805s.Processed 176205 groups out of 357830. 49% done. Time elapsed: 134186s. ETA: 138314s.Processed 181084 groups out of 357830. 51% done. Time elapsed: 134187s. ETA: 130972s.Processed 185958 groups out of 357830. 52% done. Time elapsed: 134188s. ETA: 124024s.Processed 190061 groups out of 357830. 53% done. Time elapsed: 134189s. ETA: 118450s.Processed 194470 groups out of 357830. 54% done. Time elapsed: 134190s. ETA: 112723s.Processed 199407 groups out of 357830. 56% done. Time elapsed: 134191s. ETA: 106611s.Processed 203985 groups out of 357830. 57% done. Time elapsed: 134192s. ETA: 101207s.Processed 209156 groups out of 357830. 58% done. Time elapsed: 134193s. ETA: 95388s.Processed 212425 groups out of 357830. 59% done. Time elapsed: 134194s. ETA: 91856s.Processed 217150 groups out of 357830. 61% done. Time elapsed: 134195s. ETA: 86938s.Processed 222234 groups out of 357830. 62% done. Time elapsed: 134196s. ETA: 81879s.Processed 227247 groups out of 357830. 64% done. Time elapsed: 134197s. ETA: 77113s.Processed 230392 groups out of 357830. 64% done. Time elapsed: 134200s. ETA: 74230s.Processed 235107 groups out of 357830. 66% done. Time elapsed: 134201s. ETA: 70051s.Processed 240287 groups out of 357830. 67% done. Time elapsed: 134202s. ETA: 65648s.Processed 245599 groups out of 357830. 69% done. Time elapsed: 134203s. ETA: 61326s.Processed 250909 groups out of 357830. 70% done. Time elapsed: 134204s. ETA: 57189s.Processed 256320 groups out of 357830. 72% done. Time elapsed: 134205s. ETA: 53149s.Processed 261802 groups out of 357830. 73% done. Time elapsed: 134206s. 
ETA: 49226s.Processed 267395 groups out of 357830. 75% done. Time elapsed: 134207s. ETA: 45389s.Processed 272717 groups out of 357830. 76% done. Time elapsed: 134208s. ETA: 41885s.Processed 277537 groups out of 357830. 78% done. Time elapsed: 134209s. ETA: 38827s.Processed 282890 groups out of 357830. 79% done. Time elapsed: 134210s. ETA: 35553s.Processed 288017 groups out of 357830. 80% done. Time elapsed: 134211s. ETA: 32531s.Processed 292863 groups out of 357830. 82% done. Time elapsed: 134212s. ETA: 29772s.Processed 297737 groups out of 357830. 83% done. Time elapsed: 134213s. ETA: 27088s.Processed 303036 groups out of 357830. 85% done. Time elapsed: 134214s. ETA: 24268s.Processed 308200 groups out of 357830. 86% done. Time elapsed: 134215s. ETA: 21612s.Processed 313555 groups out of 357830. 88% done. Time elapsed: 134216s. ETA: 18951s.Processed 318869 groups out of 357830. 89% done. Time elapsed: 134217s. ETA: 16399s.Processed 324224 groups out of 357830. 91% done. Time elapsed: 134218s. ETA: 13911s.Processed 329423 groups out of 357830. 92% done. Time elapsed: 134219s. ETA: 11574s.Processed 334616 groups out of 357830. 94% done. Time elapsed: 134220s. ETA: 9311s.Processed 339993 groups out of 357830. 95% done. Time elapsed: 134221s. ETA: 7041s.Processed 345409 groups out of 357830. 97% done. Time elapsed: 134222s. ETA: 4826s.Processed 350640 groups out of 357830. 98% done. Time elapsed: 134223s. ETA: 2752s.Processed 355984 groups out of 357830. 99% done. Time elapsed: 134224s. ETA: 696s.Processed 357830 groups out of 357830. 100% done. Time elapsed: 134225s. ETA: 0s.
These models were each fit to all but one of the level II ecoregions, and then predict on the indicated held-out ecoregion
# Leave-one-ecoregion-out cross validation: refit the lasso-selected model
# specification (non-zero coefficients at lambda.min) as a Gamma GLM on all
# but one level II ecoregion, then predict the held-out ecoregion.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# fixed: outList was sized from modDat_1 while the loop and indexing below use
# modDat_1_s; using one source avoids a length mismatch if their region sets differ
ecoregions <- sort(unique(modDat_1_s$NA_L2NAME))
outList <- vector(mode = "list", length = length(ecoregions))
## get the model specification from the global model:
## keep only the terms with non-zero lasso coefficients at lambda.min
mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]
# drop the intercept row (element 1) and build the reduced formula
f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))
# (removed unused model matrices X_cv/X_train/X_test and the *_glob copies;
# the refit below works directly from the data frames and f_cv)
# now, loop through so with each iteration, a different ecoregion is held out
for(i_eco in ecoregions){
# split into training and test sets
test_eco <- i_eco
print(test_eco)
# identify the rowID of observations to be in the training and test datasets
train <- which(modDat_1_s$NA_L2NAME != test_eco) # data for all ecoregions that aren't 'i_eco'
test <- which(modDat_1_s$NA_L2NAME == test_eco)  # data for the ecoregion that is 'i_eco'
trainDat_all <- modDat_1_s %>%
slice(train) %>%
select(-newRegion)
testDat_all <- modDat_1_s %>%
slice(test) %>%
select(-newRegion)
# observed response values for the held-out ecoregion
y_test <- modDat_1_s[test, response]
## refit a regular glm with the components from the global model
## (fixed: removed a stray empty argument in this call)
fit_i <- glm(data = trainDat_all, formula = f_cv,
family = stats::Gamma(link = "log"))
# CV-model predictions on the held-out ecoregion (response scale)
optimal_pred <- predict(fit_i, newdata = testDat_all, type = "response")
# null model and predictions
# the "null" model in this case is the global model
# NOTE(review): fit_glm_bestLambda is the object coef(..., s = "lambda.min") was
# taken from above; predict.glm() with newdata assumes it is a glm-class refit,
# not a cv.glmnet object — confirm upstream
null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response")
# save observed and predicted values alongside the test-set covariates
tmp <- data.frame(ecoRegion_holdout = rep(test_eco, length(y_test)), obs = y_test,
pred_opt = optimal_pred, pred_null = null_pred#,
#pred_nopenalty=nopen_pred
) %>%
cbind(testDat_all)
# RMSE of CV model
RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred, y_test), truth = "y_test", estimate = "optimal_pred")[1,]$.estimate
# RMSE of global model
RMSE_null <- yardstick::rmse(data = data.frame(null_pred, y_test), truth = "y_test", estimate = "null_pred")[1,]$.estimate
# bias (mean of observed minus predicted) of CV model
bias_optimal <- mean(y_test - optimal_pred)
# bias of global model
bias_null <- mean(y_test - null_pred)
# put output into a list
tmpList <- list("testRegion" = i_eco,
"modelObject" = fit_i,
"modelPredictions" = tmp,
"performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
"RMSE_globalModel" = RMSE_null,
"bias_cvModel" = bias_optimal,
"bias_globalModel" = bias_null))
# save model outputs in the slot matching this ecoregion's sorted position
outList[[which(ecoregions == i_eco)]] <- tmpList
}
}
## [1] "ATLANTIC HIGHLANDS"
## [1] "CENTRAL USA PLAINS"
## [1] "EVERGLADES"
## [1] "MARINE WEST COAST FOREST"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WESTERN CORDILLERA"
# table of model performance: one row per held-out ecoregion
# (fixed missing space in the last column label, for consistency with the
# '1 SE lambda' table below)
map(outList, .f = function(x) {
  cbind(data.frame("holdout region" = x$testRegion), x$performanceMetrics)
}
) %>%
  purrr::list_rbind() %>%
  kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                      "bias of CV model - mean(obs-pred.)", "bias of global model - mean(obs-pred.)"),
        caption = "Performance of Cross Validation using 'best lambda' model specification") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Held-out ecoregion | RMSE of CV model | RMSE of global model | bias of CV model - mean(obs-pred.) | bias of global model - mean(obs-pred.) |
|---|---|---|---|---|
| ATLANTIC HIGHLANDS | 27.44019 | 25.21662 | -0.695640 | -2.5368978 |
| CENTRAL USA PLAINS | 33.42218 | 33.07541 | -8.513419 | -7.5844706 |
| EVERGLADES | 34.33066 | 33.51991 | 5.880000 | 3.3456647 |
| MARINE WEST COAST FOREST | 36.28090 | 34.93893 | 5.734670 | 1.2153780 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 44.54304 | 30.06918 | -17.415484 | -0.0801357 |
| MIXED WOOD PLAINS | 31.34508 | 30.80956 | -2.183171 | -1.6454635 |
| MIXED WOOD SHIELD | 30.75344 | 29.27218 | 1.341574 | -0.3629509 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 28.55973 | 27.69727 | 7.053708 | 3.2203373 |
| SOUTHEASTERN USA PLAINS | 32.90499 | 31.38524 | -4.892493 | -1.3007597 |
| UPPER GILA MOUNTAINS | 16.71558 | 15.70599 | 5.244781 | 2.0314299 |
| WESTERN CORDILLERA | 29.90159 | 25.00284 | -10.406719 | -1.0459940 |
# visualize model predictions: residual map + histogram for each held-out ecoregion
for (i in seq_along(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  # names of the terms with non-zero coefficients in this iteration's model
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1 != 0) %>%
    rownames()
  # calculate residuals for both the CV model and the global model
  predictionData <- predictionData %>%
    mutate(resid = .[["obs"]] - .[["pred_opt"]],
           resid_globMod = .[["obs"]] - .[["pred_null"]])
  # rasterize mean CV-model residuals onto 'test_rast' (defined earlier)
  # use all_of() rather than a bare character vector for tidyselect
  plotObs <- predictionData %>%
    drop_na(all_of(response)) %>%
    #slice_sample(n = 5e4) %>%
    terra::vect(geom = c("Long", "Lat")) %>%
    terra::set.crs(crs(test_rast)) %>%
    terra::rasterize(y = test_rast,
                     field = "resid",
                     fun = mean) #%>%
    #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
    #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
  # crop the raster to the bounding box of its non-NA cells
  tempExt <- crds(plotObs, na.rm = TRUE)
  plotObs_2 <- plotObs %>%
    crop(ext(min(tempExt[, 1]), max(tempExt[, 1]),
             min(tempExt[, 2]), max(tempExt[, 2])))
  # identify locations where residuals are > 100 or < -100
  badResids_high <- predictionData %>%
    filter(resid > 100) %>%
    terra::vect(geom = c("Long", "Lat")) %>%
    terra::set.crs(crs(test_rast))
  badResids_low <- predictionData %>%
    filter(resid < -100) %>%
    terra::vect(geom = c("Long", "Lat")) %>%
    terra::set.crs(crs(test_rast))
  # histogram of residuals
  # NOTE(review): histogram shows global-model residuals (resid_globMod) while
  # the map shows CV-model residuals (resid) -- confirm this is intended
  hist_i <- ggplot(predictionData) +
    geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
    xlab(c("Residuals (obs. - pred.)"))
  # map of CV-model residuals with extreme-residual points overlaid
  map_i <- ggplot() +
    geom_spatraster(data = plotObs_2) +
    geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
    geom_sf(data = cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)), fill = NA) +
    geom_sf(data = badResids_high, col = "blue") +
    geom_sf(data = badResids_low, col = "red") +
    labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
         subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
    scale_fill_gradient2(low = "red",
                         mid = "white",
                         high = "blue",
                         midpoint = 0, na.value = "lightgrey",
                         limits = c(-100, 100)) +
    xlim(st_bbox(plotObs_2)[c(1, 3)]) +
    ylim(st_bbox(plotObs_2)[c(2, 4)])
  # stash the combined figure in a region-named variable for retrieval below
  assign(paste0("residPlot_", holdoutRegion),
         value = ggarrange(map_i, hist_i, heights = c(3, 1), ncol = 1))
}
# Collect the per-ecoregion figures (created via assign() above) into a list
# so knitr prints each one.
lapply(paste0("residPlot_", unique(modDat_1_s$NA_L2NAME)), get)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
Each of these models was fit to ten of the eleven ecoregions, and then used to predict on the indicated held-out ecoregion
if (length(prednames_fig_1se) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  ## code from Tredennick et al. 2020
  # Leave-one-ecoregion-out cross validation, '1 SE lambda' specification:
  # try each separate level II ecoregion as a test set.
  # list to hold output data, one slot per ecoregion
  # (was sized from modDat_1; use modDat_1_s so the length matches the loop below)
  outList <- vector(mode = "list", length = length(sort(unique(modDat_1_s$NA_L2NAME))))
  ## get the model specification from the global model
  # NOTE(review): coefficients are extracted with s = "lambda.min" even though
  # this section is labelled '1 SE lambda' -- confirm this is intended
  # (if fit_glm_se is a plain glm, the s argument is silently ignored)
  mat <- as.matrix(coef(fit_glm_se, s = "lambda.min"))
  # keep only terms with non-zero coefficients; [-1] drops the intercept name
  mat2 <- mat[mat[, 1] != 0, ]
  f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[-1], collapse = " + ")))
  X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
  # get response variable
  y_cv <- as.matrix(modDat_1_s[, response])
  # now, loop through so with each iteration, a different ecoregion is held out
  for (i_eco in sort(unique(modDat_1_s$NA_L2NAME))) {
    # split into training and test sets
    test_eco <- i_eco
    print(test_eco)
    # identify the rowID of observations to be in the training and test datasets
    train <- which(modDat_1_s$NA_L2NAME != test_eco) # all ecoregions that aren't 'i_eco'
    test <- which(modDat_1_s$NA_L2NAME == test_eco)  # the ecoregion that is 'i_eco'
    trainDat_all <- modDat_1_s %>%
      slice(train) %>%
      select(-newRegion)
    testDat_all <- modDat_1_s %>%
      slice(test) %>%
      select(-newRegion)
    # model matrices for input and response variables, CV model specification
    X_train <- as.matrix(X_cv[train, ])
    X_test <- as.matrix(X_cv[test, ])
    y_train <- modDat_1_s[train, response]
    y_test <- modDat_1_s[test, response]
    # model matrices for input and response variables, original model specification
    X_train_glob <- as.matrix(X[train, ])
    X_test_glob <- as.matrix(X[test, ])
    y_train_glob <- modDat_1_s[train, response]
    y_test_glob <- modDat_1_s[test, response]
    train_eco <- modDat_1_s$NA_L2NAME[train]
    # fit a regular glm with the components from the global model
    # (stray empty argument and an unused bare coef(fit_i) expression removed)
    fit_i <- glm(data = trainDat_all, formula = f_cv,
                 family = stats::Gamma(link = "log"))
    # CV-model predictions on the held-out ecoregion
    optimal_pred <- predict(fit_i, newdata = testDat_all, type = "response")
    # the "null" model in this case is the global model;
    # predict on the test data for this iteration with the global model
    # (use the generic predict(), not predict.glm() directly)
    null_pred <- predict(fit_glm_se, newdata = testDat_all, type = "response")
    # collect observed and predicted values alongside the full test data
    tmp <- data.frame(ecoRegion_holdout = rep(test_eco, length(y_test)), obs = y_test,
                      pred_opt = optimal_pred, pred_null = null_pred#,
                      #pred_nopenalty=nopen_pred
                      ) %>%
      cbind(testDat_all)
    # calculate RMSE and bias of both models on the held-out region
    # RMSE of CV model
    RMSE_optimal <- yardstick::rmse(data = data.frame(optimal_pred, y_test), truth = "y_test", estimate = "optimal_pred")[1, ]$.estimate
    # RMSE of global model
    RMSE_null <- yardstick::rmse(data = data.frame(null_pred, y_test), truth = "y_test", estimate = "null_pred")[1, ]$.estimate
    # bias (mean(obs - pred)) of CV model
    bias_optimal <- mean(y_test - optimal_pred)
    # bias of global model
    bias_null <- mean(y_test - null_pred)
    # put output into a list
    tmpList <- list("testRegion" = i_eco,
                    "modelObject" = fit_i,
                    "modelPredictions" = tmp,
                    "performanceMetrics" = data.frame("RMSE_cvModel" = RMSE_optimal,
                                                      "RMSE_globalModel" = RMSE_null,
                                                      "bias_cvModel" = bias_optimal,
                                                      "bias_globalModel" = bias_null))
    # save model outputs in the slot matching this ecoregion's sorted position
    outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
  }
}
## [1] "ATLANTIC HIGHLANDS"
## [1] "CENTRAL USA PLAINS"
## [1] "EVERGLADES"
## [1] "MARINE WEST COAST FOREST"
## [1] "MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS"
## [1] "MIXED WOOD PLAINS"
## [1] "MIXED WOOD SHIELD"
## [1] "OZARK/OUACHITA-APPALACHIAN FORESTS"
## [1] "SOUTHEASTERN USA PLAINS"
## [1] "UPPER GILA MOUNTAINS"
## [1] "WESTERN CORDILLERA"
if (length(prednames_fig_1se) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  # Summarise per-ecoregion performance into one table: one row per held-out region.
  outList |>
    purrr::map(\(x) cbind(data.frame("holdout region" = x$testRegion), x$performanceMetrics)) |>
    purrr::list_rbind() |>
    kable(col.names = c("Held-out ecoregion", "RMSE of CV model", "RMSE of global model",
                        "bias of CV model - mean(obs-pred.)", "bias of global model - mean(obs-pred.)"),
          caption = "Performance of Cross Validation using '1 SE lambda' model specification") |>
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
}
| Held-out ecoregion | RMSE of CV model | RMSE of global model | bias of CV model - mean(obs-pred.) | bias of global model - mean(obs-pred.) |
|---|---|---|---|---|
| ATLANTIC HIGHLANDS | 34.50847 | 30.72350 | -12.2719658 | -8.2749819 |
| CENTRAL USA PLAINS | 33.10996 | 33.05693 | -3.9235085 | -3.6723964 |
| EVERGLADES | 35.49564 | 35.39701 | 4.6212820 | 3.9904358 |
| MARINE WEST COAST FOREST | 37.93306 | 36.88119 | 3.5336398 | 0.2387383 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 31.87972 | 31.19780 | -0.7498428 | -2.6558560 |
| MIXED WOOD PLAINS | 30.76467 | 30.65448 | 2.5426851 | 1.9009255 |
| MIXED WOOD SHIELD | 31.27451 | 30.52851 | 8.0331063 | 3.7060582 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 29.83579 | 28.76199 | 11.4003714 | 7.5524034 |
| SOUTHEASTERN USA PLAINS | 33.78513 | 31.97228 | -9.2622602 | -3.9721436 |
| UPPER GILA MOUNTAINS | 16.15022 | 15.97474 | 2.8694803 | 2.0921625 |
| WESTERN CORDILLERA | 28.19599 | 26.20124 | -8.3782675 | -1.2313598 |
if (length(prednames_fig_1se) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {
  # residual map + histogram for each held-out ecoregion ('1 SE lambda' CV)
  for (i in seq_along(unique(modDat_1_s$NA_L2NAME))) {
    holdoutRegion <- outList[[i]]$testRegion
    predictionData <- outList[[i]]$modelPredictions
    # names of the terms with non-zero coefficients in this iteration's model
    modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
      as.data.frame() %>%
      filter(V1 != 0) %>%
      rownames()
    # calculate residuals for both the CV model and the global model
    predictionData <- predictionData %>%
      mutate(resid = .[["obs"]] - .[["pred_opt"]],
             resid_globMod = .[["obs"]] - .[["pred_null"]])
    # rasterize mean CV-model residuals onto 'test_rast' (defined earlier)
    # use all_of() rather than a bare character vector for tidyselect
    plotObs <- predictionData %>%
      drop_na(all_of(response)) %>%
      #slice_sample(n = 5e4) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast)) %>%
      terra::rasterize(y = test_rast,
                       field = "resid",
                       fun = mean) #%>%
      #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
      #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))
    # crop the raster to the bounding box of its non-NA cells
    tempExt <- crds(plotObs, na.rm = TRUE)
    plotObs_2 <- plotObs %>%
      crop(ext(min(tempExt[, 1]), max(tempExt[, 1]),
               min(tempExt[, 2]), max(tempExt[, 2])))
    # identify locations where residuals are > 100 or < -100
    badResids_high <- predictionData %>%
      filter(resid > 100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))
    badResids_low <- predictionData %>%
      filter(resid < -100) %>%
      terra::vect(geom = c("Long", "Lat")) %>%
      terra::set.crs(crs(test_rast))
    # histogram of residuals
    # NOTE(review): histogram shows global-model residuals (resid_globMod) while
    # the map shows CV-model residuals (resid) -- confirm this is intended
    hist_i <- ggplot(predictionData) +
      geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
      xlab(c("Residuals (obs. - pred.)"))
    # map of CV-model residuals with extreme-residual points overlaid
    map_i <- ggplot() +
      geom_spatraster(data = plotObs_2) +
      geom_sf(data = mapRegions, fill = NA, col = "rosybrown4", lwd = .5) +
      geom_sf(data = cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)), fill = NA) +
      geom_sf(data = badResids_high, col = "blue") +
      geom_sf(data = badResids_low, col = "red") +
      labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
           subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
      scale_fill_gradient2(low = "red",
                           mid = "white",
                           high = "blue",
                           midpoint = 0, na.value = "lightgrey",
                           limits = c(-100, 100)) +
      xlim(st_bbox(plotObs_2)[c(1, 3)]) +
      ylim(st_bbox(plotObs_2)[c(2, 4)])
    # stash the combined figure in a region-named variable for retrieval below
    assign(paste0("residPlot_", holdoutRegion),
           value = ggarrange(map_i, hist_i, heights = c(3, 1), ncol = 1))
  }
  # collect the figures in ecoregion order so knitr prints each one
  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
}
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
# # glm models
# #mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
#
# #mods2save$formula <- best_form
# #mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
# #n <- nrow(df_sample)
# #mods2save$data_rows <- n
#
#
# if(!test_run) {
# saveRDS(mods2save,
# paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n,
# #sample_group,
# ".RDS"))
# if (byRegion == TRUE) {
# ## western forests
# saveRDS(mods2save_WF,
# paste0("./models/glm_beta_model_WesternForests_", s, "_n", n,
# #sample_group,
# ".RDS"))
# ## eastern forests
# saveRDS(mods2save_EF,
# paste0("./models/glm_beta_model_EasternForests_", s, "_n", n,
# #sample_group,
# ".RDS"))
# ## grass/shrub
# saveRDS(mods2save_G,
# paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n,
# #sample_group,
# ".RDS"))
# }
# }
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)
#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)
#caret::varImp(fit)
Hash of current commit (i.e. to ID the version of the code used)
system("git rev-parse HEAD", intern=TRUE)
## [1] "79890c55a196d40eb16ae968701c4515b44c260c"
Packages etc.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.7.5
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] doMC_1.3.8 iterators_1.0.14 foreach_1.5.2 ggpubr_0.6.0 factoextra_1.0.7
## [6] USA.state.boundaries_1.0.1 glmnet_4.1-8 Matrix_1.7-0 kableExtra_1.4.0 rsample_1.2.1
## [11] here_1.0.1 StepBeta_2.1.0 ggtext_0.1.2 knitr_1.49 gridExtra_2.3
## [16] pdp_0.8.2 GGally_2.2.1 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [21] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [26] tidyverse_2.0.0 caret_6.0-94 lattice_0.22-6 ggplot2_3.5.1 sf_1.0-20
## [31] tidyterra_0.6.1 terra_1.8-21 ggspatial_1.1.9 dtplyr_1.3.1 patchwork_1.3.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_1.9.1 shape_1.4.6.1 magrittr_2.0.3 modeltools_0.2-23
## [7] farver_2.1.2 rmarkdown_2.29 vctrs_0.6.5 rstatix_0.7.2 htmltools_0.5.8.1 broom_1.0.7
## [13] Formula_1.2-5 pROC_1.18.5 sass_0.4.9 parallelly_1.37.1 KernSmooth_2.23-22 bslib_0.9.0
## [19] plyr_1.8.9 sandwich_3.1-0 zoo_1.8-12 cachem_1.1.0 commonmark_1.9.1 lifecycle_1.0.4
## [25] pkgconfig_2.0.3 R6_2.6.1 fastmap_1.2.0 future_1.33.2 digest_0.6.37 colorspace_2.1-1
## [31] furrr_0.3.1 rprojroot_2.0.4 pkgload_1.3.4 labeling_0.4.3 yardstick_1.3.1 timechange_0.3.0
## [37] mgcv_1.9-1 abind_1.4-8 compiler_4.4.0 proxy_0.4-27 aod_1.3.3 withr_3.0.2
## [43] backports_1.5.0 carData_3.0-5 betareg_3.1-4 DBI_1.2.3 ggstats_0.9.0 ggsignif_0.6.4
## [49] MASS_7.3-60.2 lava_1.8.0 classInt_0.4-10 gtools_3.9.5 ModelMetrics_1.2.2.2 tools_4.4.0
## [55] units_0.8-5 lmtest_0.9-40 future.apply_1.11.2 nnet_7.3-19 glue_1.8.0 nlme_3.1-164
## [61] gridtext_0.1.5 grid_4.4.0 reshape2_1.4.4 generics_0.1.3 recipes_1.1.0 gtable_0.3.6
## [67] tzdb_0.4.0 class_7.3-22 data.table_1.17.0 hms_1.1.3 utf8_1.2.4 car_3.1-2
## [73] xml2_1.3.7 flexmix_2.3-19 markdown_1.13 ggrepel_0.9.5 pillar_1.10.1 splines_4.4.0
## [79] survival_3.5-8 tidyselect_1.2.1 svglite_2.1.3 stats4_4.4.0 xfun_0.51 hardhat_1.4.0
## [85] timeDate_4032.109 stringi_1.8.4 yaml_2.3.10 evaluate_1.0.3 codetools_0.2-20 cli_3.6.4
## [91] rpart_4.1.23 systemfonts_1.2.1 munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.14 globals_0.16.3
## [97] gower_1.0.1 listenv_0.9.1 viridisLite_0.4.2 ipred_0.9-15 scales_1.3.0 prodlim_2024.06.25
## [103] e1071_1.7-14 crayon_1.5.3 combinat_0.0-8 rlang_1.1.5 cowplot_1.1.3